diff options
author | ro <ro@138bc75d-0d04-0410-961f-82ee72b054a4> | 2011-11-02 15:03:19 +0000 |
---|---|---|
committer | ro <ro@138bc75d-0d04-0410-961f-82ee72b054a4> | 2011-11-02 15:03:19 +0000 |
commit | 9213d2eb44a8b9bcc432b57e246d9b52d5bdc949 (patch) | |
tree | bfbde9a54f663fb7556b9dacd07709ef97c1961c /libgcc | |
parent | 237490bf10db39b859bd28598ff64f1bd2c84421 (diff) | |
download | ppe42-gcc-9213d2eb44a8b9bcc432b57e246d9b52d5bdc949.tar.gz ppe42-gcc-9213d2eb44a8b9bcc432b57e246d9b52d5bdc949.zip |
Move libgcc1 to toplevel libgcc
gcc:
* Makefile.in (LIB1ASMSRC): Don't export.
(libgcc.mvars): Don't emit LIB1ASMFUNCS, LIB1ASMSRC.
* config/arm/arm.c: Update lib1funcs.asm filename.
* config/arm/linux-eabi.h: Likewise.
* config/arm/bpabi-v6m.S, config/arm/bpabi.S,
config/arm/ieee754-df.S, config/arm/ieee754-sf.S: Move to
../libgcc/config/arm.
* config/arm/lib1funcs.asm: Move to ../libgcc/config/arm/lib1funcs.S.
* config/arm/t-arm (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/arm/t-arm-elf (LIB1ASMFUNCS): Remove.
* config/arm/t-bpabi: Likewise.
* config/arm/t-linux (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/arm/t-linux-eabi (LIB1ASMFUNCS): Remove.
* config/arm/t-strongarm-elf: Likewise.
* config/arm/t-symbian: Likewise.
* config/arm/t-vxworks: Likewise.
* config/arm/t-wince-pe: Likewise.
* config/avr/libgcc.S: Move to ../libgcc/config/avr.
* config/avr/t-avr (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/bfin/lib1funcs.asm: Move to
../libgcc/config/bfin/lib1funcs.S.
* config/bfin/t-bfin: Remove.
* config/bfin/t-bfin-elf (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/bfin/t-bfin-linux: Likewise.
* config/bfin/t-bfin-uclinux: Likewise.
* config/c6x/lib1funcs.asm: Move to
../libgcc/config/c6x/lib1funcs.S.
* config/c6x/t-c6x-elf (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/fr30/lib1funcs.asm: Move to
../libgcc/config/fr30/lib1funcs.S.
* config/fr30/t-fr30 (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/frv/lib1funcs.asm: Move to
../libgcc/config/frv/lib1funcs.S.
* config/frv/t-frv (CROSS_LIBGCC1, LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/h8300/fixunssfsi.c: Update lib1funcs.asm filename.
* config/h8300/lib1funcs.asm: Move to
../libgcc/config/h8300/lib1funcs.S.
* config/h8300/t-h8300 (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/i386/cygwin.asm: Move to ../libgcc/config/i386/cygwin.S.
* config/i386/t-cygming (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/i386/t-interix: Likewise.
* config/ia64/lib1funcs.asm: Move to
../libgcc/config/ia64/lib1funcs.S.
* config/ia64/t-hpux (LIB1ASMFUNCS, LIBGCC1_TEST): Remove.
* config/ia64/t-ia64 (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/iq2000/t-iq2000 (LIBGCC1, CROSS_LIBGCC1): Remove.
* config/m32c/m32c.c: Update m32c-lib1.S filename.
* config/m32c/m32c-lib1.S: Move to ../libgcc/config/m32c/lib1funcs.S.
* config/m32c/t-m32c (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/m32r/t-linux (CROSS_LIBGCC1, LIBGCC1, LIBGCC1_TEST): Remove.
* config/m68k/lb1sf68.asm: Move to ../libgcc/config/m68k/lb1sf68.S.
* config/m68k/t-floatlib (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/mcore/lib1.asm: Move to ../libgcc/config/mcore/lib1funcs.S.
* config/mcore/t-mcore (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/mep/mep-lib1.asm: Move to ../libgcc/config/mep/lib1funcs.S.
* config/mep/t-mep (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/mips/mips16.S: Move to ../libgcc/config/mips.
* config/mips/t-libgcc-mips16: Remove.
* config/mips/t-sr71k (LIBGCC1, CROSS_LIBGCC1): Remove.
* config/pa/milli64.S: Move to ../libgcc/config/pa.
* config/pa/t-linux (LIB1ASMFUNCS, LIB1ASMSRC): Remove.
* config/pa/t-linux64: Likewise.
* config/picochip/libgccExtras/fake_libgcc.asm: Move to
../libgcc/config/picochip/lib1funcs.S.
* config/picochip/t-picochip (LIB1ASMFUNCS, LIB1ASMSRC): Remove.
* config/sh/lib1funcs.asm: Move to ../libgcc/config/sh/lib1funcs.S.
* config/sh/lib1funcs.h: Move to ../libgcc/config/sh.
* config/sh/sh.h: Update lib1funcs.asm filename.
* config/sh/t-linux (LIB1ASMFUNCS_CACHE): Remove.
* config/sh/t-netbsd: Likewise.
* config/sh/t-sh (LIB1ASMSRC, LIB1ASMFUNCS, LIB1ASMFUNCS_CACHE):
Remove.
* config/sh/t-sh64 (LIB1ASMFUNCS): Remove.
* config/sparc/lb1spc.asm: Move to ../libgcc/config/sparc/lb1spc.S.
* config/sparc/lb1spl.asm: Remove.
* config/sparc/t-elf (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/sparc/t-leon: Likewise.
* config/spu/t-spu-elf (LIBGCC1, CROSS_LIBGCC1): Remove.
* config/v850/lib1funcs.asm: Move to ../libgcc/config/v850/lib1funcs.S.
* config/v850/t-v850 (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config/vax/lib1funcs.asm: Move to ../libgcc/config/vax/lib1funcs.S.
* config/vax/t-linux: Remove.
* config/xtensa/ieee754-df.S, config/xtensa/ieee754-sf.S: Move to
../libgcc/config/xtensa.
* config/xtensa/lib1funcs.asm: Move to
../libgcc/config/xtensa/lib1funcs.S.
* config/xtensa/t-xtensa (LIB1ASMSRC, LIB1ASMFUNCS): Remove.
* config.gcc (bfin*-rtems*): Remove bfin/t-bfin from tmake_file.
(bfin*-*): Likewise.
(mips64*-*-linux*, mipsisa64*-*-linux*): Remove
mips/t-libgcc-mips16 from tmake_file.
(mips*-*-linux*): Likewise.
(mips*-sde-elf*): Likewise.
(mipsisa32-*-elf*, mipsisa32el-*-elf*, mipsisa32r2-*-elf*)
(mipsisa32r2el-*-elf*, mipsisa64-*-elf*, mipsisa64el-*-elf*)
(mipsisa64r2-*-elf*, mipsisa64r2el-*-elf*): Likewise.
(mipsisa64sb1-*-elf*, mipsisa64sb1el-*-elf*): Likewise.
(mips-*-elf*, mipsel-*-elf*): Likewise.
(mips64-*-elf*, mips64el-*-elf*): Likewise.
(mips64orion-*-elf*, mips64orionel-*-elf*): Likewise.
(mips*-*-rtems*): Likewise.
(mipstx39-*-elf*, mipstx39el-*-elf*): Likewise.
(vax-*-linux*): Remove vax/t-linux from tmake_file.
libgcc:
* Makefile.in ($(lib1asmfuncs-o), $(lib1asmfuncs-s-o)): Use
$(srcdir) to refer to $(LIB1ASMSRC).
Use $<.
* config/arm/bpabi-v6m.S, config/arm/bpabi.S,
config/arm/ieee754-df.S, config/arm/ieee754-sf.S,
config/arm/lib1funcs.S: New files.
* config/arm/libunwind.S [!__symbian__]: Use lib1funcs.S.
* config/arm/t-arm: New file.
* config/arm/t-bpabi (LIB1ASMFUNCS): Set.
* config/arm/t-elf, config/arm/t-linux, config/arm/t-linux-eabi,
config/arm/t-strongarm-elf: New files.
* config/arm/t-symbian (LIB1ASMFUNCS): Set.
* config/arm/t-vxworks, config/arm/t-wince-pe: New files.
* config/avr/lib1funcs.S: New file.
* config/avr/t-avr (LIB1ASMSRC, LIB1ASMFUNCS): Set.
* config/bfin/lib1funcs.S, config/bfin/t-bfin: New files.
* config/c6x/lib1funcs.S: New file.
* config/c6x/t-elf (LIB1ASMSRC, LIB1ASMFUNCS): Set.
* config/fr30/lib1funcs.S, config/fr30/t-fr30: New files.
* config/frv/lib1funcs.S: New file.
* config/frv/t-frv (LIB1ASMSRC, LIB1ASMFUNCS): Set.
* config/h8300/lib1funcs.S, config/h8300/t-h8300: New files.
* config/i386/cygwin.S, config/i386/t-chkstk: New files.
* config/ia64/__divxf3.asm: Rename to ...
* config/ia64/__divxf3.S: ... this.
Adapt lib1funcs.asm filename.
* config/ia64/_fixtfdi.asm: Rename to ...
* config/ia64/_fixtfdi.S: ... this.
Adapt lib1funcs.asm filename.
* config/ia64/_fixunstfdi.asm: Rename to ...
* config/ia64/_fixunstfdi.S: ... this.
Adapt lib1funcs.asm filename.
* config/ia64/_floatditf.asm: Rename to ...
* config/ia64/_floatditf.S: ... this.
Adapt lib1funcs.asm filename.
* config/ia64/lib1funcs.S: New file.
* config/ia64/t-hpux (LIB1ASMFUNCS): Set.
* config/ia64/t-ia64 (LIB1ASMSRC, LIB1ASMFUNCS): Set.
* config/ia64/t-softfp-compat (libgcc1-tf-compats): Adapt suffix.
* config/m32c/lib1funcs.S, config/m32c/t-m32c: New files.
* config/m68k/lb1sf68.S, config/m68k/t-floatlib: New files.
* config/mcore/lib1funcs.S, config/mcore/t-mcore: New files.
* config/mep/lib1funcs.S: New file.
* config/mep/t-mep (LIB1ASMSRC, LIB1ASMFUNCS): Set.
* config/mips/mips16.S: New file.
* config/mips/t-mips16 (LIB1ASMSRC, LIB1ASMFUNCS): Set.
* config/pa/milli64.S: New file.
* config/pa/t-linux, config/pa/t-linux64: New files.
* config/picochip/lib1funcs.S: New file.
* config/picochip/t-picochip (LIB1ASMSRC, LIB1ASMFUNCS): Set.
* config/sh/lib1funcs.S, config/sh/lib1funcs.h: New files.
* config/sh/t-linux (LIB1ASMFUNCS_CACHE): Set.
* config/sh/t-netbsd: New file.
* config/sh/t-sh (LIB1ASMSRC, LIB1ASMFUNCS, LIB1ASMFUNCS_CACHE): Set.
Use $(srcdir) to refer to lib1funcs.S, adapt filename.
* config/sh/t-sh64: New file.
* config/sparc/lb1spc.S: New file.
* config/sparc/t-softmul (LIB1ASMSRC): Adapt sparc/lb1spc.asm
filename.
* config/v850/lib1funcs.S, config/v850/t-v850: New files.
* config/vax/lib1funcs.S, config/vax/t-linux: New files.
* config/xtensa/ieee754-df.S, config/xtensa/ieee754-sf.S,
config/xtensa/lib1funcs.S: New files.
* config/xtensa/t-xtensa (LIB1ASMSRC, LIB1ASMFUNCS): Set.
* config.host (arm-wrs-vxworks): Add arm/t-arm, arm/t-vxworks to
tmake_file.
(arm*-*-freebsd*): Add arm/t-arm, arm/t-strongarm-elf to tmake_file.
(arm*-*-netbsdelf*): Add arm/t-arm to tmake_file.
(arm*-*-linux*): Likewise.
Add arm/t-elf, arm/t-bpabi, arm/t-linux-eabi to tmake_file for
arm*-*-linux-*eabi, add arm/t-linux otherwise.
(arm*-*-uclinux*): Add arm/t-arm, arm/t-elf to tmake_file.
(arm*-*-ecos-elf): Likewise.
(arm*-*-eabi*, arm*-*-symbianelf*): Likewise.
(arm*-*-rtems*): Likewise.
(arm*-*-elf): Likewise.
(arm*-wince-pe*): Add arm/t-arm, arm/t-wince-pe to tmake_file.
(avr-*-rtems*): Add to tmake_file, add avr/t-avr.
(bfin*-elf*): Add bfin/t-bfin to tmake_file.
(bfin*-uclinux*): Likewise.
(bfin*-linux-uclibc*): Likewise.
(bfin*-rtems*): Likewise.
(bfin*-*): Likewise.
(fido-*-elf): Merge into m68k-*-elf*.
(fr30-*-elf): Add fr30/t-fr30 to tmake_file.
(frv-*-*linux*): Add frv/t-frv to tmake_file.
(h8300-*-rtems*): Add h8300/t-h8300 to tmake_file.
(h8300-*-elf*): Likewise.
(hppa*64*-*-linux*): Add pa/t-linux, pa/t-linux64 to tmake_file.
(hppa*-*-linux*): Add pa/t-linux to tmake_file.
(i[34567]86-*-cygwin*): Add i386/t-chkstk to tmake_file.
(i[34567]86-*-mingw*): Likewise.
(x86_64-*-mingw*): Likewise.
(i[34567]86-*-interix3*): Likewise.
(ia64*-*-hpux*): Add ia64/t-ia64, ia64/t-hpux to tmake_file.
(ia64-hp-*vms*): Add ia64/t-ia64 to tmake_file.
(m68k-*-elf*): Also handle fido-*-elf.
Add m68k/t-floatlib to tmake_file.
(m68k-*-uclinux*): Add m68k/t-floatlib to tmake_file.
(m68k-*-linux*): Likewise.
(m68k-*-rtems*): Likewise.
(mcore-*-elf): Add mcore/t-mcore to tmake_file.
(sh-*-elf*, sh[12346l]*-*-elf*): Add sh/t-sh64 to tmake_file for
sh64*-*-*.
(sh-*-linux*, sh[2346lbe]*-*-linux*): Add sh/t-sh to tmake_file.
Add sh/t-sh64 to tmake_file for sh64*-*-linux*.
(sh-*-netbsdelf*, shl*-*-netbsdelf*, sh5-*-netbsd*)
(sh5l*-*-netbsd*, sh64-*-netbsd*, sh64l*-*-netbsd*): Add sh/t-sh,
sh/t-netbsd to tmake_file.
Add sh/t-sh64 to tmake_file for sh5*-*-netbsd*, sh64*-netbsd*.
(sh-*-rtems*): Add sh/t-sh to tmake_file.
(sh-wrs-vxworks): Likewise.
(sparc-*-linux*): Add sparc/t-softmul to tmake_file except for
*-leon[3-9]*.
(v850*-*-*): Add v850/t-v850 to tmake_file.
(vax-*-linux*): Add vax/t-linux to tmake_file.
(m32c-*-elf*, m32c-*-rtems*): Add m32c/t-m32c to tmake_file.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@180773 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'libgcc')
72 files changed, 29534 insertions, 55 deletions
diff --git a/libgcc/ChangeLog b/libgcc/ChangeLog index b5d9c243a98..6b2514aba9a 100644 --- a/libgcc/ChangeLog +++ b/libgcc/ChangeLog @@ -1,5 +1,125 @@ 2011-11-02 Rainer Orth <ro@CeBiTec.Uni-Bielefeld.DE> + * Makefile.in ($(lib1asmfuncs-o), $(lib1asmfuncs-s-o)): Use + $(srcdir) to refer to $(LIB1ASMSRC). + Use $<. + * config/arm/bpabi-v6m.S, config/arm/bpabi.S, + config/arm/ieee754-df.S, config/arm/ieee754-sf.S, + config/arm/lib1funcs.S: New files. + * config/arm/libunwind.S [!__symbian__]: Use lib1funcs.S. + * config/arm/t-arm: New file. + * config/arm/t-bpabi (LIB1ASMFUNCS): Set. + * config/arm/t-elf, config/arm/t-linux, config/arm/t-linux-eabi, + config/arm/t-strongarm-elf: New files. + * config/arm/t-symbian (LIB1ASMFUNCS): Set. + * config/arm/t-vxworks, config/arm/t-wince-pe: New files. + * config/avr/lib1funcs.S: New file. + * config/avr/t-avr (LIB1ASMSRC, LIB1ASMFUNCS): Set. + * config/bfin/lib1funcs.S, config/bfin/t-bfin: New files. + * config/c6x/lib1funcs.S: New file. + * config/c6x/t-elf (LIB1ASMSRC, LIB1ASMFUNCS): Set. + * config/fr30/lib1funcs.S, config/fr30/t-fr30: New files. + * config/frv/lib1funcs.S: New file. + * config/frv/t-frv (LIB1ASMSRC, LIB1ASMFUNCS): Set. + * config/h8300/lib1funcs.S, config/h8300/t-h8300: New files. + * config/i386/cygwin.S, config/i386/t-chkstk: New files. + * config/ia64/__divxf3.asm: Rename to ... + * config/ia64/__divxf3.S: ... this. + Adapt lib1funcs.asm filename. + * config/ia64/_fixtfdi.asm: Rename to ... + * config/ia64/_fixtfdi.S: ... this. + Adapt lib1funcs.asm filename. + * config/ia64/_fixunstfdi.asm: Rename to ... + * config/ia64/_fixunstfdi.S: ... this. + Adapt lib1funcs.asm filename. + * config/ia64/_floatditf.asm: Rename to ... + * config/ia64/_floatditf.S: ... this. + Adapt lib1funcs.asm filename. + * config/ia64/lib1funcs.S: New file. + * config/ia64/t-hpux (LIB1ASMFUNCS): Set. + * config/ia64/t-ia64 (LIB1ASMSRC, LIB1ASMFUNCS): Set. + * config/ia64/t-softfp-compat (libgcc1-tf-compats): Adapt suffix. 
+ * config/m32c/lib1funcs.S, config/m32c/t-m32c: New files. + * config/m68k/lb1sf68.S, config/m68k/t-floatlib: New files. + * config/mcore/lib1funcs.S, config/mcore/t-mcore: New files. + * config/mep/lib1funcs.S: New file. + * config/mep/t-mep (LIB1ASMSRC, LIB1ASMFUNCS): Set. + * config/mips/mips16.S: New file. + * config/mips/t-mips16 (LIB1ASMSRC, LIB1ASMFUNCS): Set. + * config/pa/milli64.S: New file. + * config/pa/t-linux, config/pa/t-linux64: New files. + * config/picochip/lib1funcs.S: New file. + * config/picochip/t-picochip (LIB1ASMSRC, LIB1ASMFUNCS): Set. + * config/sh/lib1funcs.S, config/sh/lib1funcs.h: New files. + * config/sh/t-linux (LIB1ASMFUNCS_CACHE): Set. + * config/sh/t-netbsd: New file. + * config/sh/t-sh (LIB1ASMSRC, LIB1ASMFUNCS, LIB1ASMFUNCS_CACHE): Set. + Use $(srcdir) to refer to lib1funcs.S, adapt filename. + * config/sh/t-sh64: New file. + * config/sparc/lb1spc.S: New file. + * config/sparc/t-softmul (LIB1ASMSRC): Adapt sparc/lb1spc.asm + filename. + * config/v850/lib1funcs.S, config/v850/t-v850: New files. + * config/vax/lib1funcs.S, config/vax/t-linux: New files. + * config/xtensa/ieee754-df.S, config/xtensa/ieee754-sf.S, + config/xtensa/lib1funcs.S: New files. + * config/xtensa/t-xtensa (LIB1ASMSRC, LIB1ASMFUNCS): Set. + * config.host (arm-wrs-vxworks): Add arm/t-arm, arm/t-vxworks to + tmake_file. + (arm*-*-freebsd*): Add arm/t-arm, arm/t-strongarm-elf to tmake_file. + (arm*-*-netbsdelf*): Add arm/t-arm to tmake_file. + (arm*-*-linux*): Likewise. + Add arm/t-elf, arm/t-bpabi, arm/t-linux-eabi to tmake_file for + arm*-*-linux-*eabi, add arm/t-linux otherwise. + (arm*-*-uclinux*): Add arm/t-arm, arm/t-elf to tmake_file. + (arm*-*-ecos-elf): Likewise. + (arm*-*-eabi*, arm*-*-symbianelf*): Likewise. + (arm*-*-rtems*): Likewise. + (arm*-*-elf): Likewise. + (arm*-wince-pe*): Add arm/t-arm, arm/t-wince-pe to tmake_file. + (avr-*-rtems*): Add to tmake_file, add avr/t-avr. + (bfin*-elf*): Add bfin/t-bfin to tmake_file. 
+ (bfin*-uclinux*): Likewise. + (bfin*-linux-uclibc*): Likewise. + (bfin*-rtems*): Likewise. + (bfin*-*): Likewise. + (fido-*-elf): Merge into m68k-*-elf*. + (fr30-*-elf)): Add fr30/t-fr30 to tmake_file. + (frv-*-*linux*): Add frv/t-frv to tmake_file. + (h8300-*-rtems*): Add h8300/t-h8300 to tmake_file. + (h8300-*-elf*): Likewise. + (hppa*64*-*-linux*): Add pa/t-linux, pa/t-linux64 to tmake_file. + (hppa*-*-linux*): Add pa/t-linux to tmake_file. + (i[34567]86-*-cygwin*): Add i386/t-chkstk to tmake_file. + (i[34567]86-*-mingw*): Likewise. + (x86_64-*-mingw*): Likewise. + (i[34567]86-*-interix3*): Likewise. + (ia64*-*-hpux*): Add ia64/t-ia64, ia64/t-hpux to tmake_file. + (ia64-hp-*vms*): Add ia64/t-ia64 to tmake_file. + (m68k-*-elf*): Also handle fido-*-elf. + Add m68k/t-floatlib to tmake_file. + (m68k-*-uclinux*): Add m68k/t-floatlib to tmake_file. + (m68k-*-linux*): Likewise. + (m68k-*-rtems*): Likewise. + (mcore-*-elf): Add mcore/t-mcore to tmake_file. + (sh-*-elf*, sh[12346l]*-*-elf*): Add sh/t-sh64 to tmake_file for + sh64*-*-*. + (sh-*-linux*, sh[2346lbe]*-*-linux*): Add sh/t-sh to tmake_file. + Add sh/t-sh64 to tmake_file for sh64*-*-linux*. + (sh-*-netbsdelf*, shl*-*-netbsdelf*, sh5-*-netbsd*) + (sh5l*-*-netbsd*, sh64-*-netbsd*, sh64l*-*-netbsd*): Add sh/t-sh, + sh/t-netbsd to tmake_file. + Add sh/t-sh64 to tmake_file for sh5*-*-netbsd*, sh64*-netbsd*. + (sh-*-rtems*): Add sh/t-sh to tmake_file. + (sh-wrs-vxworks): Likewise. + (sparc-*-linux*): Add sparc/t-softmul to tmake_file except for + *-leon[3-9]*. + (v850*-*-*): Add v850/t-v850 to tmake_file. + (vax-*-linux*): Add vax/t-linux to tmake_file. + (m32c-*-elf*, m32c-*-rtems*): Add m32c/t-m32c to tmake_file. + +2011-11-02 Rainer Orth <ro@CeBiTec.Uni-Bielefeld.DE> + * crtstuff.c: New file. * Makefile.in (CRTSTUFF_CFLAGS): Define. (CRTSTUFF_T_CFLAGS): Define. 
diff --git a/libgcc/Makefile.in b/libgcc/Makefile.in index 467901b057a..6bbb369f8e8 100644 --- a/libgcc/Makefile.in +++ b/libgcc/Makefile.in @@ -394,25 +394,22 @@ LIB2_DIVMOD_FUNCS := $(filter-out $(LIB2FUNCS_EXCLUDE) $(LIB1ASMFUNCS), \ ifeq ($(enable_shared),yes) lib1asmfuncs-o = $(patsubst %,%$(objext),$(LIB1ASMFUNCS)) -$(lib1asmfuncs-o): %$(objext): $(gcc_srcdir)/config/$(LIB1ASMSRC) %.vis - $(gcc_compile) -DL$* -xassembler-with-cpp \ - -c $(gcc_srcdir)/config/$(LIB1ASMSRC) -include $*.vis +$(lib1asmfuncs-o): %$(objext): $(srcdir)/config/$(LIB1ASMSRC) %.vis + $(gcc_compile) -DL$* -xassembler-with-cpp -c $< -include $*.vis $(patsubst %,%.vis,$(LIB1ASMFUNCS)): %.vis: %_s$(objext) $(gen-hide-list) libgcc-objects += $(lib1asmfuncs-o) lib1asmfuncs-s-o = $(patsubst %,%_s$(objext),$(LIB1ASMFUNCS)) -$(lib1asmfuncs-s-o): %_s$(objext): $(gcc_srcdir)/config/$(LIB1ASMSRC) - $(gcc_s_compile) -DL$* -xassembler-with-cpp \ - -c $(gcc_srcdir)/config/$(LIB1ASMSRC) +$(lib1asmfuncs-s-o): %_s$(objext): $(srcdir)/config/$(LIB1ASMSRC) + $(gcc_s_compile) -DL$* -xassembler-with-cpp -c $< libgcc-s-objects += $(lib1asmfuncs-s-o) else lib1asmfuncs-o = $(patsubst %,%$(objext),$(LIB1ASMFUNCS)) -$(lib1asmfuncs-o): %$(objext): $(gcc_srcdir)/config/$(LIB1ASMSRC) - $(gcc_compile) -DL$* -xassembler-with-cpp \ - -c $(gcc_srcdir)/config/$(LIB1ASMSRC) +$(lib1asmfuncs-o): %$(objext): $(srcdir)/config/$(LIB1ASMSRC) + $(gcc_compile) -DL$* -xassembler-with-cpp -c $< libgcc-objects += $(lib1asmfuncs-o) endif diff --git a/libgcc/config.host b/libgcc/config.host index 01e2f21a797..0a05ea184b0 100644 --- a/libgcc/config.host +++ b/libgcc/config.host @@ -306,22 +306,25 @@ alpha*-dec-*vms*) md_unwind_header=alpha/vms-unwind.h ;; arm-wrs-vxworks) - tmake_file="$tmake_file t-fdpbit" + tmake_file="$tmake_file arm/t-arm arm/t-vxworks t-fdpbit" extra_parts="$extra_parts crti.o crtn.o" ;; arm*-*-freebsd*) - tmake_file="$tmake_file t-fdpbit" + tmake_file="$tmake_file arm/t-arm arm/t-strongarm-elf t-fdpbit" ;; 
arm*-*-netbsdelf*) - tmake_file="$tmake_file t-slibgcc-gld-nover" + tmake_file="$tmake_file arm/t-arm t-slibgcc-gld-nover" ;; arm*-*-linux*) # ARM GNU/Linux with ELF - tmake_file="${tmake_file} t-fixedpoint-gnu-prefix" + tmake_file="${tmake_file} arm/t-arm t-fixedpoint-gnu-prefix" case ${host} in arm*-*-linux-*eabi) - tmake_file="${tmake_file} arm/t-bpabi t-slibgcc-libgcc" + tmake_file="${tmake_file} arm/t-elf arm/t-bpabi arm/t-linux-eabi t-slibgcc-libgcc" unwind_header=config/arm/unwind-arm.h ;; + *) + tmake_file="$tmake_file arm/t-linux" + ;; esac tmake_file="$tmake_file t-softfp-sfdf t-softfp-excl arm/t-softfp t-softfp" ;; @@ -333,15 +336,15 @@ arm*-*-uclinux*) # ARM ucLinux unwind_header=config/arm/unwind-arm.h ;; esac - tmake_file="$tmake_file t-softfp-sfdf t-softfp-excl arm/t-softfp t-softfp" + tmake_file="$tmake_file arm/t-arm arm/t-elf t-softfp-sfdf t-softfp-excl arm/t-softfp t-softfp" extra_parts="$extra_parts crti.o crtn.o" ;; arm*-*-ecos-elf) - tmake_file="$tmake_file t-softfp-sfdf t-softfp-excl arm/t-softfp t-softfp" + tmake_file="$tmake_file arm/t-arm arm/t-elf t-softfp-sfdf t-softfp-excl arm/t-softfp t-softfp" extra_parts="$extra_parts crti.o crtn.o" ;; arm*-*-eabi* | arm*-*-symbianelf* ) - tmake_file="${tmake_file} t-fixedpoint-gnu-prefix" + tmake_file="${tmake_file} arm/t-arm arm/t-elf t-fixedpoint-gnu-prefix" case ${host} in arm*-*-eabi*) tmake_file="${tmake_file} arm/t-bpabi" @@ -356,17 +359,18 @@ arm*-*-eabi* | arm*-*-symbianelf* ) unwind_header=config/arm/unwind-arm.h ;; arm*-*-rtems*) - tmake_file="$tmake_file t-softfp-sfdf t-softfp-excl arm/t-softfp t-softfp" + tmake_file="$tmake_file arm/t-arm arm/t-elf t-softfp-sfdf t-softfp-excl arm/t-softfp t-softfp" extra_parts="$extra_parts crti.o crtn.o" ;; arm*-*-elf) - tmake_file="$tmake_file t-softfp-sfdf t-softfp-excl arm/t-softfp t-softfp" + tmake_file="$tmake_file arm/t-arm arm/t-elf t-softfp-sfdf t-softfp-excl arm/t-softfp t-softfp" extra_parts="$extra_parts crti.o crtn.o" ;; arm*-wince-pe*) + 
tmake_file="$tmake_file arm/t-arm arm/t-wince-pe" ;; avr-*-rtems*) - tmake_file=t-fpbit + tmake_file="$tmake_file avr/t-avr t-fpbit" # Don't use default. extra_parts= ;; @@ -375,27 +379,27 @@ avr-*-*) tmake_file="${cpu_type}/t-avr t-fpbit" ;; bfin*-elf*) - tmake_file="bfin/t-crtlibid bfin/t-crtstuff t-fdpbit" + tmake_file="bfin/t-bfin bfin/t-crtlibid bfin/t-crtstuff t-fdpbit" extra_parts="$extra_parts crtbeginS.o crtendS.o crti.o crtn.o crtlibid.o" ;; bfin*-uclinux*) - tmake_file="bfin/t-crtlibid bfin/t-crtstuff t-fdpbit" + tmake_file="bfin/t-bfin bfin/t-crtlibid bfin/t-crtstuff t-fdpbit" extra_parts="$extra_parts crtbeginS.o crtendS.o crtlibid.o" md_unwind_header=bfin/linux-unwind.h ;; bfin*-linux-uclibc*) - tmake_file="$tmake_file bfin/t-crtstuff t-fdpbit bfin/t-linux" + tmake_file="$tmake_file bfin/t-bfin bfin/t-crtstuff t-fdpbit bfin/t-linux" # No need to build crtbeginT.o on uClibc systems. Should probably # be moved to the OS specific section above. extra_parts="crtbegin.o crtbeginS.o crtend.o crtendS.o" md_unwind_header=bfin/linux-unwind.h ;; bfin*-rtems*) - tmake_file="$tmake_file t-fdpbit" + tmake_file="$tmake_file bfin/t-bfin t-fdpbit" extra_parts="$extra_parts crti.o crtn.o" ;; bfin*-*) - tmake_file="$tmake_file t-fdpbit" + tmake_file="$tmake_file bfin/t-bfin t-fdpbit" extra_parts="crtbegin.o crtend.o crti.o crtn.o" ;; crisv32-*-elf) @@ -415,10 +419,8 @@ cris-*-none) cris-*-linux* | crisv32-*-linux*) tmake_file="$tmake_file t-fdpbit cris/t-linux" ;; -fido-*-elf) - ;; fr30-*-elf) - tmake_file="$tmake_file t-fdpbit" + tmake_file="$tmake_file fr30/t-fr30 t-fdpbit" extra_parts="$extra_parts crti.o crtn.o" ;; frv-*-elf) @@ -427,20 +429,21 @@ frv-*-elf) extra_parts="frvbegin.o frvend.o" ;; frv-*-*linux*) - tmake_file="$tmake_file t-fdpbit frv/t-linux" + tmake_file="$tmake_file frv/t-frv frv/t-linux t-fdpbit" ;; h8300-*-rtems*) - tmake_file="$tmake_file t-fpbit" + tmake_file="$tmake_file h8300/t-h8300 t-fpbit" extra_parts="$extra_parts crti.o crtn.o" ;; 
h8300-*-elf*) - tmake_file="$tmake_file t-fpbit" + tmake_file="$tmake_file h8300/t-h8300 t-fpbit" extra_parts="$extra_parts crti.o crtn.o" ;; hppa*64*-*-linux*) + tmake_file="$tmake_file pa/t-linux pa/t-linux64" ;; hppa*-*-linux*) - tmake_file="$tmake_file t-slibgcc-libgcc" + tmake_file="$tmake_file pa/t-linux t-slibgcc-libgcc" # Set the libgcc version number if test x$enable_sjlj_exceptions = xyes; then tmake_file="$tmake_file pa/t-slibgcc-sjlj-ver" @@ -565,7 +568,7 @@ i[34567]86-*-cygwin*) else tmake_dlldir_file="i386/t-dlldir-x" fi - tmake_file="${tmake_file} ${tmake_eh_file} ${tmake_dlldir_file} i386/t-slibgcc-cygming i386/t-cygming i386/t-cygwin i386/t-crtfm t-dfprules" + tmake_file="${tmake_file} ${tmake_eh_file} ${tmake_dlldir_file} i386/t-slibgcc-cygming i386/t-cygming i386/t-cygwin i386/t-crtfm i386/t-chkstk t-dfprules" case ${target_thread_file} in posix) tmake_file="i386/t-mingw-pthread $tmake_file" @@ -586,7 +589,7 @@ i[34567]86-*-mingw*) else tmake_dlldir_file="i386/t-dlldir-x" fi - tmake_file="${tmake_file} ${tmake_eh_file} ${tmake_dlldir_file} i386/t-slibgcc-cygming i386/t-cygming i386/t-mingw32 i386/t-crtfm t-dfprules" + tmake_file="${tmake_file} ${tmake_eh_file} ${tmake_dlldir_file} i386/t-slibgcc-cygming i386/t-cygming i386/t-mingw32 i386/t-crtfm i386/t-chkstk t-dfprules" md_unwind_header=i386/w32-unwind.h ;; x86_64-*-mingw*) @@ -602,10 +605,11 @@ x86_64-*-mingw*) else tmake_dlldir_file="i386/t-dlldir-x" fi - tmake_file="${tmake_file} ${tmake_eh_file} ${tmake_dlldir_file} i386/t-slibgcc-cygming i386/t-mingw32 t-dfprules i386/t-crtfm" + tmake_file="${tmake_file} ${tmake_eh_file} ${tmake_dlldir_file} i386/t-slibgcc-cygming i386/t-mingw32 t-dfprules i386/t-crtfm i386/t-chkstk" extra_parts="$extra_parts crtfastmath.o" ;; i[34567]86-*-interix3*) + tmake_file="$tmake_file i386/t-chkstk" ;; ia64*-*-elf*) extra_parts="$extra_parts crtbeginS.o crtendS.o crtfastmath.o" @@ -625,10 +629,10 @@ ia64*-*-linux*) md_unwind_header=ia64/linux-unwind.h ;; 
ia64*-*-hpux*) - tmake_file="ia64/t-hpux t-slibgcc ia64/t-slibgcc-hpux t-slibgcc-hpux" + tmake_file="ia64/t-ia64 ia64/t-hpux t-slibgcc ia64/t-slibgcc-hpux t-slibgcc-hpux" ;; ia64-hp-*vms*) - tmake_file="$tmake_file ia64/t-eh-ia64 ia64/t-vms t-slibgcc-vms" + tmake_file="$tmake_file ia64/t-ia64 ia64/t-eh-ia64 ia64/t-vms t-slibgcc-vms" extra_parts="$extra_parts crtinitS.o" md_unwind_header=ia64/vms-unwind.h ;; @@ -660,18 +664,21 @@ m32r-*-linux*) m32rle-*-linux*) tmake_file="$tmake_file m32r/t-linux t-fdpbit" ;; -m68k-*-elf*) +m68k-*-elf* | fido-*-elf) + tmake_file="$tmake_file m68k/t-floatlib" ;; m68k*-*-netbsdelf*) ;; m68k*-*-openbsd*) ;; m68k-*-uclinux*) # Motorola m68k/ColdFire running uClinux with uClibc + tmake_file="$tmake_file m68k/t-floatlib" md_unwind_header=m68k/linux-unwind.h ;; m68k-*-linux*) # Motorola m68k's running GNU/Linux # with ELF format using glibc 2 # aka the GNU/Linux C library 6. + tmake_file="$tmake_file m68k/t-floatlib" # If not configured with --enable-sjlj-exceptions, bump the # libgcc version number. 
if test x$enable_sjlj_exceptions != xyes; then @@ -680,10 +687,11 @@ m68k-*-linux*) # Motorola m68k's running GNU/Linux md_unwind_header=m68k/linux-unwind.h ;; m68k-*-rtems*) + tmake_file="$tmake_file m68k/t-floatlib" extra_parts="$extra_parts crti.o crtn.o" ;; mcore-*-elf) - tmake_file=t-fdpbit + tmake_file="mcore/t-mcore t-fdpbit" extra_parts="$extra_parts crti.o crtn.o" ;; microblaze*-linux*) @@ -905,6 +913,10 @@ sh-*-elf* | sh[12346l]*-*-elf*) libic_invalidate_array_4-200.a \ libic_invalidate_array_4a.a \ libgcc-Os-4-200.a libgcc-4-300.a" + case ${host} in sh64*-*-*) + tmake_file="$tmake_file sh/t-sh64" + ;; + esac case ${host} in sh*-superh-elf) tmake_file="$tmake_file sh/t-superh" @@ -913,23 +925,33 @@ sh-*-elf* | sh[12346l]*-*-elf*) esac ;; sh-*-linux* | sh[2346lbe]*-*-linux*) - tmake_file="${tmake_file} t-slibgcc-libgcc sh/t-linux t-fdpbit" + tmake_file="${tmake_file} sh/t-sh t-slibgcc-libgcc sh/t-linux t-fdpbit" + case ${host} in sh64*-*-linux*) + tmake_file="$tmake_file sh/t-sh64" + ;; + esac md_unwind_header=sh/linux-unwind.h ;; sh-*-netbsdelf* | shl*-*-netbsdelf* | sh5-*-netbsd* | sh5l*-*-netbsd* | \ sh64-*-netbsd* | sh64l*-*-netbsd*) + tmake_file="$tmake_file sh/t-sh sh/t-netbsd" + case ${host} in + sh5*-*-netbsd* | sh64*-netbsd*) + tmake_file="$tmake_file sh/t-sh64" + ;; + esac # NetBSD's C library includes a fast software FP library that # has support for setting/setting the rounding mode, exception # mask, etc. Therefore, we don't want to include software FP # in libgcc. 
;; sh-*-rtems*) - tmake_file="$tmake_file t-crtstuff-pic t-fdpbit" + tmake_file="$tmake_file sh/t-sh t-crtstuff-pic t-fdpbit" extra_parts="$extra_parts crt1.o crti.o crtn.o crtbeginS.o crtendS.o \ $sh_ic_extra_parts $sh_opt_extra_parts" ;; sh-wrs-vxworks) - tmake_file="$tmake_file t-crtstuff-pic t-fdpbit" + tmake_file="$tmake_file sh/t-sh t-crtstuff-pic t-fdpbit" ;; sparc-*-netbsdelf*) ;; @@ -956,6 +978,13 @@ sparc-*-linux*) # SPARC's running GNU/Linux, libc6 tmake_file="${tmake_file} sparc/t-linux" ;; esac + case ${host} in + *-leon[3-9]*) + ;; + *) + tmake_file="$tmake_file sparc/t-softmul" + ;; + esac extra_parts="$extra_parts crtfastmath.o" md_unwind_header=sparc/linux-unwind.h ;; @@ -1007,9 +1036,10 @@ tic6x-*-elf) unwind_header=config/c6x/unwind-c6x.h ;; v850*-*-*) - tmake_file=t-fdpbit + tmake_file="v850/t-v850 t-fdpbit" ;; vax-*-linux*) + tmake_file="$tmake_file vax/t-linux" ;; vax-*-netbsdelf*) ;; @@ -1032,6 +1062,7 @@ am33_2.0-*-linux*) tmake_file="$tmake_file t-fdpbit" ;; m32c-*-elf*|m32c-*-rtems*) + tmake_file="$tmake_file m32c/t-m32c" ;; mep*-*-*) tmake_file="mep/t-mep t-fdpbit" diff --git a/libgcc/config/arm/bpabi-v6m.S b/libgcc/config/arm/bpabi-v6m.S new file mode 100644 index 00000000000..4ecea6da5a6 --- /dev/null +++ b/libgcc/config/arm/bpabi-v6m.S @@ -0,0 +1,318 @@ +/* Miscellaneous BPABI functions. ARMv6M implementation + + Copyright (C) 2006, 2008, 2009, 2010 Free Software Foundation, Inc. + Contributed by CodeSourcery. + + This file is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 3, or (at your option) any + later version. + + This file is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifdef __ARM_EABI__ +/* Some attributes that are common to all routines in this file. */ + /* Tag_ABI_align_needed: This code does not require 8-byte + alignment from the caller. */ + /* .eabi_attribute 24, 0 -- default setting. */ + /* Tag_ABI_align_preserved: This code preserves 8-byte + alignment in any callee. */ + .eabi_attribute 25, 1 +#endif /* __ARM_EABI__ */ + +#ifdef L_aeabi_lcmp + +FUNC_START aeabi_lcmp + cmp xxh, yyh + beq 1f + bgt 2f + mov r0, #1 + neg r0, r0 + RET +2: + mov r0, #1 + RET +1: + sub r0, xxl, yyl + beq 1f + bhi 2f + mov r0, #1 + neg r0, r0 + RET +2: + mov r0, #1 +1: + RET + FUNC_END aeabi_lcmp + +#endif /* L_aeabi_lcmp */ + +#ifdef L_aeabi_ulcmp + +FUNC_START aeabi_ulcmp + cmp xxh, yyh + bne 1f + sub r0, xxl, yyl + beq 2f +1: + bcs 1f + mov r0, #1 + neg r0, r0 + RET +1: + mov r0, #1 +2: + RET + FUNC_END aeabi_ulcmp + +#endif /* L_aeabi_ulcmp */ + +.macro test_div_by_zero signed + cmp yyh, #0 + bne 7f + cmp yyl, #0 + bne 7f + cmp xxh, #0 + bne 2f + cmp xxl, #0 +2: + .ifc \signed, unsigned + beq 3f + mov xxh, #0 + mvn xxh, xxh @ 0xffffffff + mov xxl, xxh +3: + .else + beq 5f + blt 6f + mov xxl, #0 + mvn xxl, xxl @ 0xffffffff + lsr xxh, xxl, #1 @ 0x7fffffff + b 5f +6: mov xxh, #0x80 + lsl xxh, xxh, #24 @ 0x80000000 + mov xxl, #0 +5: + .endif + @ tailcalls are tricky on v6-m. + push {r0, r1, r2} + ldr r0, 1f + adr r1, 1f + add r0, r1 + str r0, [sp, #8] + @ We know we are not on armv4t, so pop pc is safe. 
+ pop {r0, r1, pc} + .align 2 +1: + .word __aeabi_ldiv0 - 1b +7: +.endm + +#ifdef L_aeabi_ldivmod + +FUNC_START aeabi_ldivmod + test_div_by_zero signed + + push {r0, r1} + mov r0, sp + push {r0, lr} + ldr r0, [sp, #8] + bl SYM(__gnu_ldivmod_helper) + ldr r3, [sp, #4] + mov lr, r3 + add sp, sp, #8 + pop {r2, r3} + RET + FUNC_END aeabi_ldivmod + +#endif /* L_aeabi_ldivmod */ + +#ifdef L_aeabi_uldivmod + +FUNC_START aeabi_uldivmod + test_div_by_zero unsigned + + push {r0, r1} + mov r0, sp + push {r0, lr} + ldr r0, [sp, #8] + bl SYM(__gnu_uldivmod_helper) + ldr r3, [sp, #4] + mov lr, r3 + add sp, sp, #8 + pop {r2, r3} + RET + FUNC_END aeabi_uldivmod + +#endif /* L_aeabi_uldivmod */ + +#ifdef L_arm_addsubsf3 + +FUNC_START aeabi_frsub + + push {r4, lr} + mov r4, #1 + lsl r4, #31 + eor r0, r0, r4 + bl __aeabi_fadd + pop {r4, pc} + + FUNC_END aeabi_frsub + +#endif /* L_arm_addsubsf3 */ + +#ifdef L_arm_cmpsf2 + +FUNC_START aeabi_cfrcmple + + mov ip, r0 + mov r0, r1 + mov r1, ip + b 6f + +FUNC_START aeabi_cfcmpeq +FUNC_ALIAS aeabi_cfcmple aeabi_cfcmpeq + + @ The status-returning routines are required to preserve all + @ registers except ip, lr, and cpsr. +6: push {r0, r1, r2, r3, r4, lr} + bl __lesf2 + @ Set the Z flag correctly, and the C flag unconditionally. + cmp r0, #0 + @ Clear the C flag if the return value was -1, indicating + @ that the first operand was smaller than the second. 
+ bmi 1f + mov r1, #0 + cmn r0, r1 +1: + pop {r0, r1, r2, r3, r4, pc} + + FUNC_END aeabi_cfcmple + FUNC_END aeabi_cfcmpeq + FUNC_END aeabi_cfrcmple + +FUNC_START aeabi_fcmpeq + + push {r4, lr} + bl __eqsf2 + neg r0, r0 + add r0, r0, #1 + pop {r4, pc} + + FUNC_END aeabi_fcmpeq + +.macro COMPARISON cond, helper, mode=sf2 +FUNC_START aeabi_fcmp\cond + + push {r4, lr} + bl __\helper\mode + cmp r0, #0 + b\cond 1f + mov r0, #0 + pop {r4, pc} +1: + mov r0, #1 + pop {r4, pc} + + FUNC_END aeabi_fcmp\cond +.endm + +COMPARISON lt, le +COMPARISON le, le +COMPARISON gt, ge +COMPARISON ge, ge + +#endif /* L_arm_cmpsf2 */ + +#ifdef L_arm_addsubdf3 + +FUNC_START aeabi_drsub + + push {r4, lr} + mov r4, #1 + lsl r4, #31 + eor xxh, xxh, r4 + bl __aeabi_dadd + pop {r4, pc} + + FUNC_END aeabi_drsub + +#endif /* L_arm_addsubdf3 */ + +#ifdef L_arm_cmpdf2 + +FUNC_START aeabi_cdrcmple + + mov ip, r0 + mov r0, r2 + mov r2, ip + mov ip, r1 + mov r1, r3 + mov r3, ip + b 6f + +FUNC_START aeabi_cdcmpeq +FUNC_ALIAS aeabi_cdcmple aeabi_cdcmpeq + + @ The status-returning routines are required to preserve all + @ registers except ip, lr, and cpsr. +6: push {r0, r1, r2, r3, r4, lr} + bl __ledf2 + @ Set the Z flag correctly, and the C flag unconditionally. + cmp r0, #0 + @ Clear the C flag if the return value was -1, indicating + @ that the first operand was smaller than the second. 
+ bmi 1f + mov r1, #0 + cmn r0, r1 +1: + pop {r0, r1, r2, r3, r4, pc} + + FUNC_END aeabi_cdcmple + FUNC_END aeabi_cdcmpeq + FUNC_END aeabi_cdrcmple + +FUNC_START aeabi_dcmpeq + + push {r4, lr} + bl __eqdf2 + neg r0, r0 + add r0, r0, #1 + pop {r4, pc} + + FUNC_END aeabi_dcmpeq + +.macro COMPARISON cond, helper, mode=df2 +FUNC_START aeabi_dcmp\cond + + push {r4, lr} + bl __\helper\mode + cmp r0, #0 + b\cond 1f + mov r0, #0 + pop {r4, pc} +1: + mov r0, #1 + pop {r4, pc} + + FUNC_END aeabi_dcmp\cond +.endm + +COMPARISON lt, le +COMPARISON le, le +COMPARISON gt, ge +COMPARISON ge, ge + +#endif /* L_arm_cmpdf2 */ diff --git a/libgcc/config/arm/bpabi.S b/libgcc/config/arm/bpabi.S new file mode 100644 index 00000000000..2ff338927fa --- /dev/null +++ b/libgcc/config/arm/bpabi.S @@ -0,0 +1,163 @@ +/* Miscellaneous BPABI functions. + + Copyright (C) 2003, 2004, 2007, 2008, 2009, 2010 + Free Software Foundation, Inc. + Contributed by CodeSourcery, LLC. + + This file is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 3, or (at your option) any + later version. + + This file is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifdef __ARM_EABI__ +/* Some attributes that are common to all routines in this file. 
*/ + /* Tag_ABI_align_needed: This code does not require 8-byte + alignment from the caller. */ + /* .eabi_attribute 24, 0 -- default setting. */ + /* Tag_ABI_align_preserved: This code preserves 8-byte + alignment in any callee. */ + .eabi_attribute 25, 1 +#endif /* __ARM_EABI__ */ + +#ifdef L_aeabi_lcmp + +ARM_FUNC_START aeabi_lcmp + cmp xxh, yyh + do_it lt + movlt r0, #-1 + do_it gt + movgt r0, #1 + do_it ne + RETc(ne) + subs r0, xxl, yyl + do_it lo + movlo r0, #-1 + do_it hi + movhi r0, #1 + RET + FUNC_END aeabi_lcmp + +#endif /* L_aeabi_lcmp */ + +#ifdef L_aeabi_ulcmp + +ARM_FUNC_START aeabi_ulcmp + cmp xxh, yyh + do_it lo + movlo r0, #-1 + do_it hi + movhi r0, #1 + do_it ne + RETc(ne) + cmp xxl, yyl + do_it lo + movlo r0, #-1 + do_it hi + movhi r0, #1 + do_it eq + moveq r0, #0 + RET + FUNC_END aeabi_ulcmp + +#endif /* L_aeabi_ulcmp */ + +.macro test_div_by_zero signed +/* Tail-call to divide-by-zero handlers which may be overridden by the user, + so unwinding works properly. */ +#if defined(__thumb2__) + cbnz yyh, 1f + cbnz yyl, 1f + cmp xxh, #0 + do_it eq + cmpeq xxl, #0 + .ifc \signed, unsigned + beq 2f + mov xxh, #0xffffffff + mov xxl, xxh +2: + .else + do_it lt, t + movlt xxl, #0 + movlt xxh, #0x80000000 + do_it gt, t + movgt xxh, #0x7fffffff + movgt xxl, #0xffffffff + .endif + b SYM (__aeabi_ldiv0) __PLT__ +1: +#else + /* Note: Thumb-1 code calls via an ARM shim on processors which + support ARM mode. 
*/ + cmp yyh, #0 + cmpeq yyl, #0 + bne 2f + cmp xxh, #0 + cmpeq xxl, #0 + .ifc \signed, unsigned + movne xxh, #0xffffffff + movne xxl, #0xffffffff + .else + movlt xxh, #0x80000000 + movlt xxl, #0 + movgt xxh, #0x7fffffff + movgt xxl, #0xffffffff + .endif + b SYM (__aeabi_ldiv0) __PLT__ +2: +#endif +.endm + +#ifdef L_aeabi_ldivmod + +ARM_FUNC_START aeabi_ldivmod + test_div_by_zero signed + + sub sp, sp, #8 +#if defined(__thumb2__) + mov ip, sp + push {ip, lr} +#else + do_push {sp, lr} +#endif + bl SYM(__gnu_ldivmod_helper) __PLT__ + ldr lr, [sp, #4] + add sp, sp, #8 + do_pop {r2, r3} + RET + +#endif /* L_aeabi_ldivmod */ + +#ifdef L_aeabi_uldivmod + +ARM_FUNC_START aeabi_uldivmod + test_div_by_zero unsigned + + sub sp, sp, #8 +#if defined(__thumb2__) + mov ip, sp + push {ip, lr} +#else + do_push {sp, lr} +#endif + bl SYM(__gnu_uldivmod_helper) __PLT__ + ldr lr, [sp, #4] + add sp, sp, #8 + do_pop {r2, r3} + RET + +#endif /* L_aeabi_uldivmod */ + diff --git a/libgcc/config/arm/ieee754-df.S b/libgcc/config/arm/ieee754-df.S new file mode 100644 index 00000000000..eb0c38632d0 --- /dev/null +++ b/libgcc/config/arm/ieee754-df.S @@ -0,0 +1,1447 @@ +/* ieee754-df.S double-precision floating point support for ARM + + Copyright (C) 2003, 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc. + Contributed by Nicolas Pitre (nico@cam.org) + + This file is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 3, or (at your option) any + later version. + + This file is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* + * Notes: + * + * The goal of this code is to be as fast as possible. This is + * not meant to be easy to understand for the casual reader. + * For slightly simpler code please see the single precision version + * of this file. + * + * Only the default rounding mode is intended for best performance. + * Exceptions aren't supported yet, but that can be added quite easily + * if necessary without impacting performance. + */ + + +@ For FPA, float words are always big-endian. +@ For VFP, float words follow the memory system mode. +#if defined(__VFP_FP__) && !defined(__ARMEB__) +#define xl r0 +#define xh r1 +#define yl r2 +#define yh r3 +#else +#define xh r0 +#define xl r1 +#define yh r2 +#define yl r3 +#endif + + +#ifdef L_arm_negdf2 + +ARM_FUNC_START negdf2 +ARM_FUNC_ALIAS aeabi_dneg negdf2 + + @ flip sign bit + eor xh, xh, #0x80000000 + RET + + FUNC_END aeabi_dneg + FUNC_END negdf2 + +#endif + +#ifdef L_arm_addsubdf3 + +ARM_FUNC_START aeabi_drsub + + eor xh, xh, #0x80000000 @ flip sign bit of first arg + b 1f + +ARM_FUNC_START subdf3 +ARM_FUNC_ALIAS aeabi_dsub subdf3 + + eor yh, yh, #0x80000000 @ flip sign bit of second arg +#if defined(__INTERWORKING_STUBS__) + b 1f @ Skip Thumb-code prologue +#endif + +ARM_FUNC_START adddf3 +ARM_FUNC_ALIAS aeabi_dadd adddf3 + +1: do_push {r4, r5, lr} + + @ Look for zeroes, equal values, INF, or NAN. 
+ shift1 lsl, r4, xh, #1 + shift1 lsl, r5, yh, #1 + teq r4, r5 + do_it eq + teqeq xl, yl + do_it ne, ttt + COND(orr,s,ne) ip, r4, xl + COND(orr,s,ne) ip, r5, yl + COND(mvn,s,ne) ip, r4, asr #21 + COND(mvn,s,ne) ip, r5, asr #21 + beq LSYM(Lad_s) + + @ Compute exponent difference. Make largest exponent in r4, + @ corresponding arg in xh-xl, and positive exponent difference in r5. + shift1 lsr, r4, r4, #21 + rsbs r5, r4, r5, lsr #21 + do_it lt + rsblt r5, r5, #0 + ble 1f + add r4, r4, r5 + eor yl, xl, yl + eor yh, xh, yh + eor xl, yl, xl + eor xh, yh, xh + eor yl, xl, yl + eor yh, xh, yh +1: + @ If exponent difference is too large, return largest argument + @ already in xh-xl. We need up to 54 bit to handle proper rounding + @ of 0x1p54 - 1.1. + cmp r5, #54 + do_it hi + RETLDM "r4, r5" hi + + @ Convert mantissa to signed integer. + tst xh, #0x80000000 + mov xh, xh, lsl #12 + mov ip, #0x00100000 + orr xh, ip, xh, lsr #12 + beq 1f +#if defined(__thumb2__) + negs xl, xl + sbc xh, xh, xh, lsl #1 +#else + rsbs xl, xl, #0 + rsc xh, xh, #0 +#endif +1: + tst yh, #0x80000000 + mov yh, yh, lsl #12 + orr yh, ip, yh, lsr #12 + beq 1f +#if defined(__thumb2__) + negs yl, yl + sbc yh, yh, yh, lsl #1 +#else + rsbs yl, yl, #0 + rsc yh, yh, #0 +#endif +1: + @ If exponent == difference, one or both args were denormalized. + @ Since this is not common case, rescale them off line. + teq r4, r5 + beq LSYM(Lad_d) +LSYM(Lad_x): + + @ Compensate for the exponent overlapping the mantissa MSB added later + sub r4, r4, #1 + + @ Shift yh-yl right per r5, add to xh-xl, keep leftover bits into ip. 
+ rsbs lr, r5, #32 + blt 1f + shift1 lsl, ip, yl, lr + shiftop adds xl xl yl lsr r5 yl + adc xh, xh, #0 + shiftop adds xl xl yh lsl lr yl + shiftop adcs xh xh yh asr r5 yh + b 2f +1: sub r5, r5, #32 + add lr, lr, #32 + cmp yl, #1 + shift1 lsl,ip, yh, lr + do_it cs + orrcs ip, ip, #2 @ 2 not 1, to allow lsr #1 later + shiftop adds xl xl yh asr r5 yh + adcs xh, xh, yh, asr #31 +2: + @ We now have a result in xh-xl-ip. + @ Keep absolute value in xh-xl-ip, sign in r5 (the n bit was set above) + and r5, xh, #0x80000000 + bpl LSYM(Lad_p) +#if defined(__thumb2__) + mov lr, #0 + negs ip, ip + sbcs xl, lr, xl + sbc xh, lr, xh +#else + rsbs ip, ip, #0 + rscs xl, xl, #0 + rsc xh, xh, #0 +#endif + + @ Determine how to normalize the result. +LSYM(Lad_p): + cmp xh, #0x00100000 + bcc LSYM(Lad_a) + cmp xh, #0x00200000 + bcc LSYM(Lad_e) + + @ Result needs to be shifted right. + movs xh, xh, lsr #1 + movs xl, xl, rrx + mov ip, ip, rrx + add r4, r4, #1 + + @ Make sure we did not bust our exponent. + mov r2, r4, lsl #21 + cmn r2, #(2 << 21) + bcs LSYM(Lad_o) + + @ Our result is now properly aligned into xh-xl, remaining bits in ip. + @ Round with MSB of ip. If halfway between two numbers, round towards + @ LSB of xl = 0. + @ Pack final result together. +LSYM(Lad_e): + cmp ip, #0x80000000 + do_it eq + COND(mov,s,eq) ip, xl, lsr #1 + adcs xl, xl, #0 + adc xh, xh, r4, lsl #20 + orr xh, xh, r5 + RETLDM "r4, r5" + + @ Result must be shifted left and exponent adjusted. +LSYM(Lad_a): + movs ip, ip, lsl #1 + adcs xl, xl, xl + adc xh, xh, xh + tst xh, #0x00100000 + sub r4, r4, #1 + bne LSYM(Lad_e) + + @ No rounding necessary since ip will always be 0 at this point. 
+LSYM(Lad_l): + +#if __ARM_ARCH__ < 5 + + teq xh, #0 + movne r3, #20 + moveq r3, #52 + moveq xh, xl + moveq xl, #0 + mov r2, xh + cmp r2, #(1 << 16) + movhs r2, r2, lsr #16 + subhs r3, r3, #16 + cmp r2, #(1 << 8) + movhs r2, r2, lsr #8 + subhs r3, r3, #8 + cmp r2, #(1 << 4) + movhs r2, r2, lsr #4 + subhs r3, r3, #4 + cmp r2, #(1 << 2) + subhs r3, r3, #2 + sublo r3, r3, r2, lsr #1 + sub r3, r3, r2, lsr #3 + +#else + + teq xh, #0 + do_it eq, t + moveq xh, xl + moveq xl, #0 + clz r3, xh + do_it eq + addeq r3, r3, #32 + sub r3, r3, #11 + +#endif + + @ determine how to shift the value. + subs r2, r3, #32 + bge 2f + adds r2, r2, #12 + ble 1f + + @ shift value left 21 to 31 bits, or actually right 11 to 1 bits + @ since a register switch happened above. + add ip, r2, #20 + rsb r2, r2, #12 + shift1 lsl, xl, xh, ip + shift1 lsr, xh, xh, r2 + b 3f + + @ actually shift value left 1 to 20 bits, which might also represent + @ 32 to 52 bits if counting the register switch that happened earlier. +1: add r2, r2, #20 +2: do_it le + rsble ip, r2, #32 + shift1 lsl, xh, xh, r2 +#if defined(__thumb2__) + lsr ip, xl, ip + itt le + orrle xh, xh, ip + lslle xl, xl, r2 +#else + orrle xh, xh, xl, lsr ip + movle xl, xl, lsl r2 +#endif + + @ adjust exponent accordingly. +3: subs r4, r4, r3 + do_it ge, tt + addge xh, xh, r4, lsl #20 + orrge xh, xh, r5 + RETLDM "r4, r5" ge + + @ Exponent too small, denormalize result. + @ Find out proper shift value. + mvn r4, r4 + subs r4, r4, #31 + bge 2f + adds r4, r4, #12 + bgt 1f + + @ shift result right of 1 to 20 bits, sign is in r5. + add r4, r4, #20 + rsb r2, r4, #32 + shift1 lsr, xl, xl, r4 + shiftop orr xl xl xh lsl r2 yh + shiftop orr xh r5 xh lsr r4 yh + RETLDM "r4, r5" + + @ shift result right of 21 to 31 bits, or left 11 to 1 bits after + @ a register switch from xh to xl. 
+1: rsb r4, r4, #12 + rsb r2, r4, #32 + shift1 lsr, xl, xl, r2 + shiftop orr xl xl xh lsl r4 yh + mov xh, r5 + RETLDM "r4, r5" + + @ Shift value right of 32 to 64 bits, or 0 to 32 bits after a switch + @ from xh to xl. +2: shift1 lsr, xl, xh, r4 + mov xh, r5 + RETLDM "r4, r5" + + @ Adjust exponents for denormalized arguments. + @ Note that r4 must not remain equal to 0. +LSYM(Lad_d): + teq r4, #0 + eor yh, yh, #0x00100000 + do_it eq, te + eoreq xh, xh, #0x00100000 + addeq r4, r4, #1 + subne r5, r5, #1 + b LSYM(Lad_x) + + +LSYM(Lad_s): + mvns ip, r4, asr #21 + do_it ne + COND(mvn,s,ne) ip, r5, asr #21 + beq LSYM(Lad_i) + + teq r4, r5 + do_it eq + teqeq xl, yl + beq 1f + + @ Result is x + 0.0 = x or 0.0 + y = y. + orrs ip, r4, xl + do_it eq, t + moveq xh, yh + moveq xl, yl + RETLDM "r4, r5" + +1: teq xh, yh + + @ Result is x - x = 0. + do_it ne, tt + movne xh, #0 + movne xl, #0 + RETLDM "r4, r5" ne + + @ Result is x + x = 2x. + movs ip, r4, lsr #21 + bne 2f + movs xl, xl, lsl #1 + adcs xh, xh, xh + do_it cs + orrcs xh, xh, #0x80000000 + RETLDM "r4, r5" +2: adds r4, r4, #(2 << 21) + do_it cc, t + addcc xh, xh, #(1 << 20) + RETLDM "r4, r5" cc + and r5, xh, #0x80000000 + + @ Overflow: return INF. +LSYM(Lad_o): + orr xh, r5, #0x7f000000 + orr xh, xh, #0x00f00000 + mov xl, #0 + RETLDM "r4, r5" + + @ At least one of x or y is INF/NAN. 
+ @ if xh-xl != INF/NAN: return yh-yl (which is INF/NAN) + @ if yh-yl != INF/NAN: return xh-xl (which is INF/NAN) + @ if either is NAN: return NAN + @ if opposite sign: return NAN + @ otherwise return xh-xl (which is INF or -INF) +LSYM(Lad_i): + mvns ip, r4, asr #21 + do_it ne, te + movne xh, yh + movne xl, yl + COND(mvn,s,eq) ip, r5, asr #21 + do_it ne, t + movne yh, xh + movne yl, xl + orrs r4, xl, xh, lsl #12 + do_it eq, te + COND(orr,s,eq) r5, yl, yh, lsl #12 + teqeq xh, yh + orrne xh, xh, #0x00080000 @ quiet NAN + RETLDM "r4, r5" + + FUNC_END aeabi_dsub + FUNC_END subdf3 + FUNC_END aeabi_dadd + FUNC_END adddf3 + +ARM_FUNC_START floatunsidf +ARM_FUNC_ALIAS aeabi_ui2d floatunsidf + + teq r0, #0 + do_it eq, t + moveq r1, #0 + RETc(eq) + do_push {r4, r5, lr} + mov r4, #0x400 @ initial exponent + add r4, r4, #(52-1 - 1) + mov r5, #0 @ sign bit is 0 + .ifnc xl, r0 + mov xl, r0 + .endif + mov xh, #0 + b LSYM(Lad_l) + + FUNC_END aeabi_ui2d + FUNC_END floatunsidf + +ARM_FUNC_START floatsidf +ARM_FUNC_ALIAS aeabi_i2d floatsidf + + teq r0, #0 + do_it eq, t + moveq r1, #0 + RETc(eq) + do_push {r4, r5, lr} + mov r4, #0x400 @ initial exponent + add r4, r4, #(52-1 - 1) + ands r5, r0, #0x80000000 @ sign bit in r5 + do_it mi + rsbmi r0, r0, #0 @ absolute value + .ifnc xl, r0 + mov xl, r0 + .endif + mov xh, #0 + b LSYM(Lad_l) + + FUNC_END aeabi_i2d + FUNC_END floatsidf + +ARM_FUNC_START extendsfdf2 +ARM_FUNC_ALIAS aeabi_f2d extendsfdf2 + + movs r2, r0, lsl #1 @ toss sign bit + mov xh, r2, asr #3 @ stretch exponent + mov xh, xh, rrx @ retrieve sign bit + mov xl, r2, lsl #28 @ retrieve remaining bits + do_it ne, ttt + COND(and,s,ne) r3, r2, #0xff000000 @ isolate exponent + teqne r3, #0xff000000 @ if not 0, check if INF or NAN + eorne xh, xh, #0x38000000 @ fixup exponent otherwise. + RETc(ne) @ and return it. + + teq r2, #0 @ if actually 0 + do_it ne, e + teqne r3, #0xff000000 @ or INF or NAN + RETc(eq) @ we are done already. + + @ value was denormalized. We can normalize it now. 
+ do_push {r4, r5, lr} + mov r4, #0x380 @ setup corresponding exponent + and r5, xh, #0x80000000 @ move sign bit in r5 + bic xh, xh, #0x80000000 + b LSYM(Lad_l) + + FUNC_END aeabi_f2d + FUNC_END extendsfdf2 + +ARM_FUNC_START floatundidf +ARM_FUNC_ALIAS aeabi_ul2d floatundidf + + orrs r2, r0, r1 +#if !defined (__VFP_FP__) && !defined(__SOFTFP__) + do_it eq, t + mvfeqd f0, #0.0 +#else + do_it eq +#endif + RETc(eq) + +#if !defined (__VFP_FP__) && !defined(__SOFTFP__) + @ For hard FPA code we want to return via the tail below so that + @ we can return the result in f0 as well as in r0/r1 for backwards + @ compatibility. + adr ip, LSYM(f0_ret) + @ Push pc as well so that RETLDM works correctly. + do_push {r4, r5, ip, lr, pc} +#else + do_push {r4, r5, lr} +#endif + + mov r5, #0 + b 2f + +ARM_FUNC_START floatdidf +ARM_FUNC_ALIAS aeabi_l2d floatdidf + + orrs r2, r0, r1 +#if !defined (__VFP_FP__) && !defined(__SOFTFP__) + do_it eq, t + mvfeqd f0, #0.0 +#else + do_it eq +#endif + RETc(eq) + +#if !defined (__VFP_FP__) && !defined(__SOFTFP__) + @ For hard FPA code we want to return via the tail below so that + @ we can return the result in f0 as well as in r0/r1 for backwards + @ compatibility. + adr ip, LSYM(f0_ret) + @ Push pc as well so that RETLDM works correctly. + do_push {r4, r5, ip, lr, pc} +#else + do_push {r4, r5, lr} +#endif + + ands r5, ah, #0x80000000 @ sign bit in r5 + bpl 2f +#if defined(__thumb2__) + negs al, al + sbc ah, ah, ah, lsl #1 +#else + rsbs al, al, #0 + rsc ah, ah, #0 +#endif +2: + mov r4, #0x400 @ initial exponent + add r4, r4, #(52-1 - 1) + + @ FPA little-endian: must swap the word order. + .ifnc xh, ah + mov ip, al + mov xh, ah + mov xl, ip + .endif + + movs ip, xh, lsr #22 + beq LSYM(Lad_p) + + @ The value is too big. Scale it down a bit... 
+ mov r2, #3 + movs ip, ip, lsr #3 + do_it ne + addne r2, r2, #3 + movs ip, ip, lsr #3 + do_it ne + addne r2, r2, #3 + add r2, r2, ip, lsr #3 + + rsb r3, r2, #32 + shift1 lsl, ip, xl, r3 + shift1 lsr, xl, xl, r2 + shiftop orr xl xl xh lsl r3 lr + shift1 lsr, xh, xh, r2 + add r4, r4, r2 + b LSYM(Lad_p) + +#if !defined (__VFP_FP__) && !defined(__SOFTFP__) + + @ Legacy code expects the result to be returned in f0. Copy it + @ there as well. +LSYM(f0_ret): + do_push {r0, r1} + ldfd f0, [sp], #8 + RETLDM + +#endif + + FUNC_END floatdidf + FUNC_END aeabi_l2d + FUNC_END floatundidf + FUNC_END aeabi_ul2d + +#endif /* L_addsubdf3 */ + +#ifdef L_arm_muldivdf3 + +ARM_FUNC_START muldf3 +ARM_FUNC_ALIAS aeabi_dmul muldf3 + do_push {r4, r5, r6, lr} + + @ Mask out exponents, trap any zero/denormal/INF/NAN. + mov ip, #0xff + orr ip, ip, #0x700 + ands r4, ip, xh, lsr #20 + do_it ne, tte + COND(and,s,ne) r5, ip, yh, lsr #20 + teqne r4, ip + teqne r5, ip + bleq LSYM(Lml_s) + + @ Add exponents together + add r4, r4, r5 + + @ Determine final sign. + eor r6, xh, yh + + @ Convert mantissa to unsigned integer. + @ If power of two, branch to a separate path. + bic xh, xh, ip, lsl #21 + bic yh, yh, ip, lsl #21 + orrs r5, xl, xh, lsl #12 + do_it ne + COND(orr,s,ne) r5, yl, yh, lsl #12 + orr xh, xh, #0x00100000 + orr yh, yh, #0x00100000 + beq LSYM(Lml_1) + +#if __ARM_ARCH__ < 4 + + @ Put sign bit in r6, which will be restored in yl later. + and r6, r6, #0x80000000 + + @ Well, no way to make it shorter without the umull instruction. 
+ stmfd sp!, {r6, r7, r8, r9, sl, fp} + mov r7, xl, lsr #16 + mov r8, yl, lsr #16 + mov r9, xh, lsr #16 + mov sl, yh, lsr #16 + bic xl, xl, r7, lsl #16 + bic yl, yl, r8, lsl #16 + bic xh, xh, r9, lsl #16 + bic yh, yh, sl, lsl #16 + mul ip, xl, yl + mul fp, xl, r8 + mov lr, #0 + adds ip, ip, fp, lsl #16 + adc lr, lr, fp, lsr #16 + mul fp, r7, yl + adds ip, ip, fp, lsl #16 + adc lr, lr, fp, lsr #16 + mul fp, xl, sl + mov r5, #0 + adds lr, lr, fp, lsl #16 + adc r5, r5, fp, lsr #16 + mul fp, r7, yh + adds lr, lr, fp, lsl #16 + adc r5, r5, fp, lsr #16 + mul fp, xh, r8 + adds lr, lr, fp, lsl #16 + adc r5, r5, fp, lsr #16 + mul fp, r9, yl + adds lr, lr, fp, lsl #16 + adc r5, r5, fp, lsr #16 + mul fp, xh, sl + mul r6, r9, sl + adds r5, r5, fp, lsl #16 + adc r6, r6, fp, lsr #16 + mul fp, r9, yh + adds r5, r5, fp, lsl #16 + adc r6, r6, fp, lsr #16 + mul fp, xl, yh + adds lr, lr, fp + mul fp, r7, sl + adcs r5, r5, fp + mul fp, xh, yl + adc r6, r6, #0 + adds lr, lr, fp + mul fp, r9, r8 + adcs r5, r5, fp + mul fp, r7, r8 + adc r6, r6, #0 + adds lr, lr, fp + mul fp, xh, yh + adcs r5, r5, fp + adc r6, r6, #0 + ldmfd sp!, {yl, r7, r8, r9, sl, fp} + +#else + + @ Here is the actual multiplication. + umull ip, lr, xl, yl + mov r5, #0 + umlal lr, r5, xh, yl + and yl, r6, #0x80000000 + umlal lr, r5, xl, yh + mov r6, #0 + umlal r5, r6, xh, yh + +#endif + + @ The LSBs in ip are only significant for the final rounding. + @ Fold them into lr. + teq ip, #0 + do_it ne + orrne lr, lr, #1 + + @ Adjust result upon the MSB position. + sub r4, r4, #0xff + cmp r6, #(1 << (20-11)) + sbc r4, r4, #0x300 + bcs 1f + movs lr, lr, lsl #1 + adcs r5, r5, r5 + adc r6, r6, r6 +1: + @ Shift to final position, add sign to result. + orr xh, yl, r6, lsl #11 + orr xh, xh, r5, lsr #21 + mov xl, r5, lsl #11 + orr xl, xl, lr, lsr #21 + mov lr, lr, lsl #11 + + @ Check exponent range for under/overflow. 
+ subs ip, r4, #(254 - 1) + do_it hi + cmphi ip, #0x700 + bhi LSYM(Lml_u) + + @ Round the result, merge final exponent. + cmp lr, #0x80000000 + do_it eq + COND(mov,s,eq) lr, xl, lsr #1 + adcs xl, xl, #0 + adc xh, xh, r4, lsl #20 + RETLDM "r4, r5, r6" + + @ Multiplication by 0x1p*: let''s shortcut a lot of code. +LSYM(Lml_1): + and r6, r6, #0x80000000 + orr xh, r6, xh + orr xl, xl, yl + eor xh, xh, yh + subs r4, r4, ip, lsr #1 + do_it gt, tt + COND(rsb,s,gt) r5, r4, ip + orrgt xh, xh, r4, lsl #20 + RETLDM "r4, r5, r6" gt + + @ Under/overflow: fix things up for the code below. + orr xh, xh, #0x00100000 + mov lr, #0 + subs r4, r4, #1 + +LSYM(Lml_u): + @ Overflow? + bgt LSYM(Lml_o) + + @ Check if denormalized result is possible, otherwise return signed 0. + cmn r4, #(53 + 1) + do_it le, tt + movle xl, #0 + bicle xh, xh, #0x7fffffff + RETLDM "r4, r5, r6" le + + @ Find out proper shift value. + rsb r4, r4, #0 + subs r4, r4, #32 + bge 2f + adds r4, r4, #12 + bgt 1f + + @ shift result right of 1 to 20 bits, preserve sign bit, round, etc. + add r4, r4, #20 + rsb r5, r4, #32 + shift1 lsl, r3, xl, r5 + shift1 lsr, xl, xl, r4 + shiftop orr xl xl xh lsl r5 r2 + and r2, xh, #0x80000000 + bic xh, xh, #0x80000000 + adds xl, xl, r3, lsr #31 + shiftop adc xh r2 xh lsr r4 r6 + orrs lr, lr, r3, lsl #1 + do_it eq + biceq xl, xl, r3, lsr #31 + RETLDM "r4, r5, r6" + + @ shift result right of 21 to 31 bits, or left 11 to 1 bits after + @ a register switch from xh to xl. Then round. +1: rsb r4, r4, #12 + rsb r5, r4, #32 + shift1 lsl, r3, xl, r4 + shift1 lsr, xl, xl, r5 + shiftop orr xl xl xh lsl r4 r2 + bic xh, xh, #0x7fffffff + adds xl, xl, r3, lsr #31 + adc xh, xh, #0 + orrs lr, lr, r3, lsl #1 + do_it eq + biceq xl, xl, r3, lsr #31 + RETLDM "r4, r5, r6" + + @ Shift value right of 32 to 64 bits, or 0 to 32 bits after a switch + @ from xh to xl. Leftover bits are in r3-r6-lr for rounding. 
+2: rsb r5, r4, #32 + shiftop orr lr lr xl lsl r5 r2 + shift1 lsr, r3, xl, r4 + shiftop orr r3 r3 xh lsl r5 r2 + shift1 lsr, xl, xh, r4 + bic xh, xh, #0x7fffffff + shiftop bic xl xl xh lsr r4 r2 + add xl, xl, r3, lsr #31 + orrs lr, lr, r3, lsl #1 + do_it eq + biceq xl, xl, r3, lsr #31 + RETLDM "r4, r5, r6" + + @ One or both arguments are denormalized. + @ Scale them leftwards and preserve sign bit. +LSYM(Lml_d): + teq r4, #0 + bne 2f + and r6, xh, #0x80000000 +1: movs xl, xl, lsl #1 + adc xh, xh, xh + tst xh, #0x00100000 + do_it eq + subeq r4, r4, #1 + beq 1b + orr xh, xh, r6 + teq r5, #0 + do_it ne + RETc(ne) +2: and r6, yh, #0x80000000 +3: movs yl, yl, lsl #1 + adc yh, yh, yh + tst yh, #0x00100000 + do_it eq + subeq r5, r5, #1 + beq 3b + orr yh, yh, r6 + RET + +LSYM(Lml_s): + @ Isolate the INF and NAN cases away + teq r4, ip + and r5, ip, yh, lsr #20 + do_it ne + teqne r5, ip + beq 1f + + @ Here, one or more arguments are either denormalized or zero. + orrs r6, xl, xh, lsl #1 + do_it ne + COND(orr,s,ne) r6, yl, yh, lsl #1 + bne LSYM(Lml_d) + + @ Result is 0, but determine sign anyway. +LSYM(Lml_z): + eor xh, xh, yh + and xh, xh, #0x80000000 + mov xl, #0 + RETLDM "r4, r5, r6" + +1: @ One or both args are INF or NAN. + orrs r6, xl, xh, lsl #1 + do_it eq, te + moveq xl, yl + moveq xh, yh + COND(orr,s,ne) r6, yl, yh, lsl #1 + beq LSYM(Lml_n) @ 0 * INF or INF * 0 -> NAN + teq r4, ip + bne 1f + orrs r6, xl, xh, lsl #12 + bne LSYM(Lml_n) @ NAN * <anything> -> NAN +1: teq r5, ip + bne LSYM(Lml_i) + orrs r6, yl, yh, lsl #12 + do_it ne, t + movne xl, yl + movne xh, yh + bne LSYM(Lml_n) @ <anything> * NAN -> NAN + + @ Result is INF, but we need to determine its sign. +LSYM(Lml_i): + eor xh, xh, yh + + @ Overflow: return INF (sign already in xh). +LSYM(Lml_o): + and xh, xh, #0x80000000 + orr xh, xh, #0x7f000000 + orr xh, xh, #0x00f00000 + mov xl, #0 + RETLDM "r4, r5, r6" + + @ Return a quiet NAN. 
+LSYM(Lml_n): + orr xh, xh, #0x7f000000 + orr xh, xh, #0x00f80000 + RETLDM "r4, r5, r6" + + FUNC_END aeabi_dmul + FUNC_END muldf3 + +ARM_FUNC_START divdf3 +ARM_FUNC_ALIAS aeabi_ddiv divdf3 + + do_push {r4, r5, r6, lr} + + @ Mask out exponents, trap any zero/denormal/INF/NAN. + mov ip, #0xff + orr ip, ip, #0x700 + ands r4, ip, xh, lsr #20 + do_it ne, tte + COND(and,s,ne) r5, ip, yh, lsr #20 + teqne r4, ip + teqne r5, ip + bleq LSYM(Ldv_s) + + @ Subtract divisor exponent from dividend''s. + sub r4, r4, r5 + + @ Preserve final sign into lr. + eor lr, xh, yh + + @ Convert mantissa to unsigned integer. + @ Dividend -> r5-r6, divisor -> yh-yl. + orrs r5, yl, yh, lsl #12 + mov xh, xh, lsl #12 + beq LSYM(Ldv_1) + mov yh, yh, lsl #12 + mov r5, #0x10000000 + orr yh, r5, yh, lsr #4 + orr yh, yh, yl, lsr #24 + mov yl, yl, lsl #8 + orr r5, r5, xh, lsr #4 + orr r5, r5, xl, lsr #24 + mov r6, xl, lsl #8 + + @ Initialize xh with final sign bit. + and xh, lr, #0x80000000 + + @ Ensure result will land to known bit position. + @ Apply exponent bias accordingly. + cmp r5, yh + do_it eq + cmpeq r6, yl + adc r4, r4, #(255 - 2) + add r4, r4, #0x300 + bcs 1f + movs yh, yh, lsr #1 + mov yl, yl, rrx +1: + @ Perform first subtraction to align result to a nibble. + subs r6, r6, yl + sbc r5, r5, yh + movs yh, yh, lsr #1 + mov yl, yl, rrx + mov xl, #0x00100000 + mov ip, #0x00080000 + + @ The actual division loop. 
+1: subs lr, r6, yl + sbcs lr, r5, yh + do_it cs, tt + subcs r6, r6, yl + movcs r5, lr + orrcs xl, xl, ip + movs yh, yh, lsr #1 + mov yl, yl, rrx + subs lr, r6, yl + sbcs lr, r5, yh + do_it cs, tt + subcs r6, r6, yl + movcs r5, lr + orrcs xl, xl, ip, lsr #1 + movs yh, yh, lsr #1 + mov yl, yl, rrx + subs lr, r6, yl + sbcs lr, r5, yh + do_it cs, tt + subcs r6, r6, yl + movcs r5, lr + orrcs xl, xl, ip, lsr #2 + movs yh, yh, lsr #1 + mov yl, yl, rrx + subs lr, r6, yl + sbcs lr, r5, yh + do_it cs, tt + subcs r6, r6, yl + movcs r5, lr + orrcs xl, xl, ip, lsr #3 + + orrs lr, r5, r6 + beq 2f + mov r5, r5, lsl #4 + orr r5, r5, r6, lsr #28 + mov r6, r6, lsl #4 + mov yh, yh, lsl #3 + orr yh, yh, yl, lsr #29 + mov yl, yl, lsl #3 + movs ip, ip, lsr #4 + bne 1b + + @ We are done with a word of the result. + @ Loop again for the low word if this pass was for the high word. + tst xh, #0x00100000 + bne 3f + orr xh, xh, xl + mov xl, #0 + mov ip, #0x80000000 + b 1b +2: + @ Be sure result starts in the high word. + tst xh, #0x00100000 + do_it eq, t + orreq xh, xh, xl + moveq xl, #0 +3: + @ Check exponent range for under/overflow. + subs ip, r4, #(254 - 1) + do_it hi + cmphi ip, #0x700 + bhi LSYM(Lml_u) + + @ Round the result, merge final exponent. + subs ip, r5, yh + do_it eq, t + COND(sub,s,eq) ip, r6, yl + COND(mov,s,eq) ip, xl, lsr #1 + adcs xl, xl, #0 + adc xh, xh, r4, lsl #20 + RETLDM "r4, r5, r6" + + @ Division by 0x1p*: shortcut a lot of code. +LSYM(Ldv_1): + and lr, lr, #0x80000000 + orr xh, lr, xh, lsr #12 + adds r4, r4, ip, lsr #1 + do_it gt, tt + COND(rsb,s,gt) r5, r4, ip + orrgt xh, xh, r4, lsl #20 + RETLDM "r4, r5, r6" gt + + orr xh, xh, #0x00100000 + mov lr, #0 + subs r4, r4, #1 + b LSYM(Lml_u) + + @ Result might need to be denormalized: put remainder bits + @ in lr for rounding considerations. +LSYM(Ldv_u): + orr lr, r5, r6 + b LSYM(Lml_u) + + @ One or both arguments is either INF, NAN or zero. 
+LSYM(Ldv_s): + and r5, ip, yh, lsr #20 + teq r4, ip + do_it eq + teqeq r5, ip + beq LSYM(Lml_n) @ INF/NAN / INF/NAN -> NAN + teq r4, ip + bne 1f + orrs r4, xl, xh, lsl #12 + bne LSYM(Lml_n) @ NAN / <anything> -> NAN + teq r5, ip + bne LSYM(Lml_i) @ INF / <anything> -> INF + mov xl, yl + mov xh, yh + b LSYM(Lml_n) @ INF / (INF or NAN) -> NAN +1: teq r5, ip + bne 2f + orrs r5, yl, yh, lsl #12 + beq LSYM(Lml_z) @ <anything> / INF -> 0 + mov xl, yl + mov xh, yh + b LSYM(Lml_n) @ <anything> / NAN -> NAN +2: @ If both are nonzero, we need to normalize and resume above. + orrs r6, xl, xh, lsl #1 + do_it ne + COND(orr,s,ne) r6, yl, yh, lsl #1 + bne LSYM(Lml_d) + @ One or both arguments are 0. + orrs r4, xl, xh, lsl #1 + bne LSYM(Lml_i) @ <non_zero> / 0 -> INF + orrs r5, yl, yh, lsl #1 + bne LSYM(Lml_z) @ 0 / <non_zero> -> 0 + b LSYM(Lml_n) @ 0 / 0 -> NAN + + FUNC_END aeabi_ddiv + FUNC_END divdf3 + +#endif /* L_muldivdf3 */ + +#ifdef L_arm_cmpdf2 + +@ Note: only r0 (return value) and ip are clobbered here. + +ARM_FUNC_START gtdf2 +ARM_FUNC_ALIAS gedf2 gtdf2 + mov ip, #-1 + b 1f + +ARM_FUNC_START ltdf2 +ARM_FUNC_ALIAS ledf2 ltdf2 + mov ip, #1 + b 1f + +ARM_FUNC_START cmpdf2 +ARM_FUNC_ALIAS nedf2 cmpdf2 +ARM_FUNC_ALIAS eqdf2 cmpdf2 + mov ip, #1 @ how should we specify unordered here? + +1: str ip, [sp, #-4]! + + @ Trap any INF/NAN first. + mov ip, xh, lsl #1 + mvns ip, ip, asr #21 + mov ip, yh, lsl #1 + do_it ne + COND(mvn,s,ne) ip, ip, asr #21 + beq 3f + + @ Test for equality. + @ Note that 0.0 is equal to -0.0. +2: add sp, sp, #4 + orrs ip, xl, xh, lsl #1 @ if x == 0.0 or -0.0 + do_it eq, e + COND(orr,s,eq) ip, yl, yh, lsl #1 @ and y == 0.0 or -0.0 + teqne xh, yh @ or xh == yh + do_it eq, tt + teqeq xl, yl @ and xl == yl + moveq r0, #0 @ then equal. 
+ RETc(eq) + + @ Clear C flag + cmn r0, #0 + + @ Compare sign, + teq xh, yh + + @ Compare values if same sign + do_it pl + cmppl xh, yh + do_it eq + cmpeq xl, yl + + @ Result: + do_it cs, e + movcs r0, yh, asr #31 + mvncc r0, yh, asr #31 + orr r0, r0, #1 + RET + + @ Look for a NAN. +3: mov ip, xh, lsl #1 + mvns ip, ip, asr #21 + bne 4f + orrs ip, xl, xh, lsl #12 + bne 5f @ x is NAN +4: mov ip, yh, lsl #1 + mvns ip, ip, asr #21 + bne 2b + orrs ip, yl, yh, lsl #12 + beq 2b @ y is not NAN +5: ldr r0, [sp], #4 @ unordered return code + RET + + FUNC_END gedf2 + FUNC_END gtdf2 + FUNC_END ledf2 + FUNC_END ltdf2 + FUNC_END nedf2 + FUNC_END eqdf2 + FUNC_END cmpdf2 + +ARM_FUNC_START aeabi_cdrcmple + + mov ip, r0 + mov r0, r2 + mov r2, ip + mov ip, r1 + mov r1, r3 + mov r3, ip + b 6f + +ARM_FUNC_START aeabi_cdcmpeq +ARM_FUNC_ALIAS aeabi_cdcmple aeabi_cdcmpeq + + @ The status-returning routines are required to preserve all + @ registers except ip, lr, and cpsr. +6: do_push {r0, lr} + ARM_CALL cmpdf2 + @ Set the Z flag correctly, and the C flag unconditionally. + cmp r0, #0 + @ Clear the C flag if the return value was -1, indicating + @ that the first operand was smaller than the second. + do_it mi + cmnmi r0, #0 + RETLDM "r0" + + FUNC_END aeabi_cdcmple + FUNC_END aeabi_cdcmpeq + FUNC_END aeabi_cdrcmple + +ARM_FUNC_START aeabi_dcmpeq + + str lr, [sp, #-8]! + ARM_CALL aeabi_cdcmple + do_it eq, e + moveq r0, #1 @ Equal to. + movne r0, #0 @ Less than, greater than, or unordered. + RETLDM + + FUNC_END aeabi_dcmpeq + +ARM_FUNC_START aeabi_dcmplt + + str lr, [sp, #-8]! + ARM_CALL aeabi_cdcmple + do_it cc, e + movcc r0, #1 @ Less than. + movcs r0, #0 @ Equal to, greater than, or unordered. + RETLDM + + FUNC_END aeabi_dcmplt + +ARM_FUNC_START aeabi_dcmple + + str lr, [sp, #-8]! + ARM_CALL aeabi_cdcmple + do_it ls, e + movls r0, #1 @ Less than or equal to. + movhi r0, #0 @ Greater than or unordered. + RETLDM + + FUNC_END aeabi_dcmple + +ARM_FUNC_START aeabi_dcmpge + + str lr, [sp, #-8]! 
+ ARM_CALL aeabi_cdrcmple + do_it ls, e + movls r0, #1 @ Operand 2 is less than or equal to operand 1. + movhi r0, #0 @ Operand 2 is greater than operand 1, or unordered. + RETLDM + + FUNC_END aeabi_dcmpge + +ARM_FUNC_START aeabi_dcmpgt + + str lr, [sp, #-8]! + ARM_CALL aeabi_cdrcmple + do_it cc, e + movcc r0, #1 @ Operand 2 is less than operand 1. + movcs r0, #0 @ Operand 2 is greater than or equal to operand 1, + @ or they are unordered. + RETLDM + + FUNC_END aeabi_dcmpgt + +#endif /* L_cmpdf2 */ + +#ifdef L_arm_unorddf2 + +ARM_FUNC_START unorddf2 +ARM_FUNC_ALIAS aeabi_dcmpun unorddf2 + + mov ip, xh, lsl #1 + mvns ip, ip, asr #21 + bne 1f + orrs ip, xl, xh, lsl #12 + bne 3f @ x is NAN +1: mov ip, yh, lsl #1 + mvns ip, ip, asr #21 + bne 2f + orrs ip, yl, yh, lsl #12 + bne 3f @ y is NAN +2: mov r0, #0 @ arguments are ordered. + RET + +3: mov r0, #1 @ arguments are unordered. + RET + + FUNC_END aeabi_dcmpun + FUNC_END unorddf2 + +#endif /* L_arm_unorddf2 */ + +#ifdef L_arm_fixdfsi + +ARM_FUNC_START fixdfsi +ARM_FUNC_ALIAS aeabi_d2iz fixdfsi + + @ check exponent range. + mov r2, xh, lsl #1 + adds r2, r2, #(1 << 21) + bcs 2f @ value is INF or NAN + bpl 1f @ value is too small + mov r3, #(0xfffffc00 + 31) + subs r2, r3, r2, asr #21 + bls 3f @ value is too large + + @ scale value + mov r3, xh, lsl #11 + orr r3, r3, #0x80000000 + orr r3, r3, xl, lsr #21 + tst xh, #0x80000000 @ the sign bit + shift1 lsr, r0, r3, r2 + do_it ne + rsbne r0, r0, #0 + RET + +1: mov r0, #0 + RET + +2: orrs xl, xl, xh, lsl #12 + bne 4f @ x is NAN. +3: ands r0, xh, #0x80000000 @ the sign bit + do_it eq + moveq r0, #0x7fffffff @ maximum signed positive si + RET + +4: mov r0, #0 @ How should we convert NAN? + RET + + FUNC_END aeabi_d2iz + FUNC_END fixdfsi + +#endif /* L_arm_fixdfsi */ + +#ifdef L_arm_fixunsdfsi + +ARM_FUNC_START fixunsdfsi +ARM_FUNC_ALIAS aeabi_d2uiz fixunsdfsi + + @ check exponent range. 
+ movs r2, xh, lsl #1 + bcs 1f @ value is negative + adds r2, r2, #(1 << 21) + bcs 2f @ value is INF or NAN + bpl 1f @ value is too small + mov r3, #(0xfffffc00 + 31) + subs r2, r3, r2, asr #21 + bmi 3f @ value is too large + + @ scale value + mov r3, xh, lsl #11 + orr r3, r3, #0x80000000 + orr r3, r3, xl, lsr #21 + shift1 lsr, r0, r3, r2 + RET + +1: mov r0, #0 + RET + +2: orrs xl, xl, xh, lsl #12 + bne 4f @ value is NAN. +3: mov r0, #0xffffffff @ maximum unsigned si + RET + +4: mov r0, #0 @ How should we convert NAN? + RET + + FUNC_END aeabi_d2uiz + FUNC_END fixunsdfsi + +#endif /* L_arm_fixunsdfsi */ + +#ifdef L_arm_truncdfsf2 + +ARM_FUNC_START truncdfsf2 +ARM_FUNC_ALIAS aeabi_d2f truncdfsf2 + + @ check exponent range. + mov r2, xh, lsl #1 + subs r3, r2, #((1023 - 127) << 21) + do_it cs, t + COND(sub,s,cs) ip, r3, #(1 << 21) + COND(rsb,s,cs) ip, ip, #(254 << 21) + bls 2f @ value is out of range + +1: @ shift and round mantissa + and ip, xh, #0x80000000 + mov r2, xl, lsl #3 + orr xl, ip, xl, lsr #29 + cmp r2, #0x80000000 + adc r0, xl, r3, lsl #2 + do_it eq + biceq r0, r0, #1 + RET + +2: @ either overflow or underflow + tst xh, #0x40000000 + bne 3f @ overflow + + @ check if denormalized value is possible + adds r2, r3, #(23 << 21) + do_it lt, t + andlt r0, xh, #0x80000000 @ too small, return signed 0. + RETc(lt) + + @ denormalize value so we can resume with the code above afterwards. + orr xh, xh, #0x00100000 + mov r2, r2, lsr #21 + rsb r2, r2, #24 + rsb ip, r2, #32 +#if defined(__thumb2__) + lsls r3, xl, ip +#else + movs r3, xl, lsl ip +#endif + shift1 lsr, xl, xl, r2 + do_it ne + orrne xl, xl, #1 @ fold r3 for rounding considerations. 
+ mov r3, xh, lsl #11 + mov r3, r3, lsr #11 + shiftop orr xl xl r3 lsl ip ip + shift1 lsr, r3, r3, r2 + mov r3, r3, lsl #1 + b 1b + +3: @ check for NAN + mvns r3, r2, asr #21 + bne 5f @ simple overflow + orrs r3, xl, xh, lsl #12 + do_it ne, tt + movne r0, #0x7f000000 + orrne r0, r0, #0x00c00000 + RETc(ne) @ return NAN + +5: @ return INF with sign + and r0, xh, #0x80000000 + orr r0, r0, #0x7f000000 + orr r0, r0, #0x00800000 + RET + + FUNC_END aeabi_d2f + FUNC_END truncdfsf2 + +#endif /* L_arm_truncdfsf2 */ diff --git a/libgcc/config/arm/ieee754-sf.S b/libgcc/config/arm/ieee754-sf.S new file mode 100644 index 00000000000..c93f66d8ff8 --- /dev/null +++ b/libgcc/config/arm/ieee754-sf.S @@ -0,0 +1,1060 @@ +/* ieee754-sf.S single-precision floating point support for ARM + + Copyright (C) 2003, 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc. + Contributed by Nicolas Pitre (nico@cam.org) + + This file is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 3, or (at your option) any + later version. + + This file is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* + * Notes: + * + * The goal of this code is to be as fast as possible. This is + * not meant to be easy to understand for the casual reader. 
+ * + * Only the default rounding mode is intended for best performance. + * Exceptions aren't supported yet, but that can be added quite easily + * if necessary without impacting performance. + */ + +#ifdef L_arm_negsf2 + +ARM_FUNC_START negsf2 +ARM_FUNC_ALIAS aeabi_fneg negsf2 + + eor r0, r0, #0x80000000 @ flip sign bit + RET + + FUNC_END aeabi_fneg + FUNC_END negsf2 + +#endif /* L_arm_negsf2 */ + +#ifdef L_arm_addsubsf3 + +ARM_FUNC_START aeabi_frsub + + eor r0, r0, #0x80000000 @ flip sign bit of first arg + b 1f + +ARM_FUNC_START subsf3 +ARM_FUNC_ALIAS aeabi_fsub subsf3 + + eor r1, r1, #0x80000000 @ flip sign bit of second arg +#if defined(__INTERWORKING_STUBS__) + b 1f @ Skip Thumb-code prologue +#endif + +ARM_FUNC_START addsf3 +ARM_FUNC_ALIAS aeabi_fadd addsf3 + +1: @ Look for zeroes, equal values, INF, or NAN. + movs r2, r0, lsl #1 + do_it ne, ttt + COND(mov,s,ne) r3, r1, lsl #1 + teqne r2, r3 + COND(mvn,s,ne) ip, r2, asr #24 + COND(mvn,s,ne) ip, r3, asr #24 + beq LSYM(Lad_s) + + @ Compute exponent difference. Make largest exponent in r2, + @ corresponding arg in r0, and positive exponent difference in r3. + mov r2, r2, lsr #24 + rsbs r3, r2, r3, lsr #24 + do_it gt, ttt + addgt r2, r2, r3 + eorgt r1, r0, r1 + eorgt r0, r1, r0 + eorgt r1, r0, r1 + do_it lt + rsblt r3, r3, #0 + + @ If exponent difference is too large, return largest argument + @ already in r0. We need up to 25 bit to handle proper rounding + @ of 0x1p25 - 1.1. + cmp r3, #25 + do_it hi + RETc(hi) + + @ Convert mantissa to signed integer. + tst r0, #0x80000000 + orr r0, r0, #0x00800000 + bic r0, r0, #0xff000000 + do_it ne + rsbne r0, r0, #0 + tst r1, #0x80000000 + orr r1, r1, #0x00800000 + bic r1, r1, #0xff000000 + do_it ne + rsbne r1, r1, #0 + + @ If exponent == difference, one or both args were denormalized. + @ Since this is not the common case, rescale them off line. 
+ teq r2, r3 + beq LSYM(Lad_d) +LSYM(Lad_x): + + @ Compensate for the exponent overlapping the mantissa MSB added later + sub r2, r2, #1 + + @ Shift and add second arg to first arg in r0. + @ Keep leftover bits in r1. + shiftop adds r0 r0 r1 asr r3 ip + rsb r3, r3, #32 + shift1 lsl, r1, r1, r3 + + @ Keep absolute value in r0-r1, sign in r3 (the N flag was set above) + and r3, r0, #0x80000000 + bpl LSYM(Lad_p) +#if defined(__thumb2__) + negs r1, r1 + sbc r0, r0, r0, lsl #1 +#else + rsbs r1, r1, #0 + rsc r0, r0, #0 +#endif + + @ Determine how to normalize the result. +LSYM(Lad_p): + cmp r0, #0x00800000 + bcc LSYM(Lad_a) + cmp r0, #0x01000000 + bcc LSYM(Lad_e) + + @ Result needs to be shifted right. + movs r0, r0, lsr #1 + mov r1, r1, rrx + add r2, r2, #1 + + @ Make sure we did not bust our exponent. + cmp r2, #254 + bhs LSYM(Lad_o) + + @ Our result is now properly aligned in r0, remaining bits in r1. + @ Pack final result together. + @ Round with MSB of r1. If halfway between two numbers, round towards + @ LSB of r0 = 0. +LSYM(Lad_e): + cmp r1, #0x80000000 + adc r0, r0, r2, lsl #23 + do_it eq + biceq r0, r0, #1 + orr r0, r0, r3 + RET + + @ Result must be shifted left and exponent adjusted. +LSYM(Lad_a): + movs r1, r1, lsl #1 + adc r0, r0, r0 + tst r0, #0x00800000 + sub r2, r2, #1 + bne LSYM(Lad_e) + + @ No rounding necessary since r1 will always be 0 at this point. +LSYM(Lad_l): + +#if __ARM_ARCH__ < 5 + + movs ip, r0, lsr #12 + moveq r0, r0, lsl #12 + subeq r2, r2, #12 + tst r0, #0x00ff0000 + moveq r0, r0, lsl #8 + subeq r2, r2, #8 + tst r0, #0x00f00000 + moveq r0, r0, lsl #4 + subeq r2, r2, #4 + tst r0, #0x00c00000 + moveq r0, r0, lsl #2 + subeq r2, r2, #2 + cmp r0, #0x00800000 + movcc r0, r0, lsl #1 + sbcs r2, r2, #0 + +#else + + clz ip, r0 + sub ip, ip, #8 + subs r2, r2, ip + shift1 lsl, r0, r0, ip + +#endif + + @ Final result with sign + @ If exponent negative, denormalize result. 
+ do_it ge, et + addge r0, r0, r2, lsl #23 + rsblt r2, r2, #0 + orrge r0, r0, r3 +#if defined(__thumb2__) + do_it lt, t + lsrlt r0, r0, r2 + orrlt r0, r3, r0 +#else + orrlt r0, r3, r0, lsr r2 +#endif + RET + + @ Fixup and adjust bit position for denormalized arguments. + @ Note that r2 must not remain equal to 0. +LSYM(Lad_d): + teq r2, #0 + eor r1, r1, #0x00800000 + do_it eq, te + eoreq r0, r0, #0x00800000 + addeq r2, r2, #1 + subne r3, r3, #1 + b LSYM(Lad_x) + + @ Special cases: an INF/NAN operand, x + 0, x - x, or x + x. +LSYM(Lad_s): + mov r3, r1, lsl #1 + + mvns ip, r2, asr #24 + do_it ne + COND(mvn,s,ne) ip, r3, asr #24 + beq LSYM(Lad_i) + + teq r2, r3 + beq 1f + + @ Result is x + 0.0 = x or 0.0 + y = y. + teq r2, #0 + do_it eq + moveq r0, r1 + RET + +1: teq r0, r1 + + @ Result is x - x = 0. + do_it ne, t + movne r0, #0 + RETc(ne) + + @ Result is x + x = 2x. + tst r2, #0xff000000 + bne 2f + movs r0, r0, lsl #1 + do_it cs + orrcs r0, r0, #0x80000000 + RET +2: adds r2, r2, #(2 << 24) + do_it cc, t + addcc r0, r0, #(1 << 23) + RETc(cc) + and r3, r0, #0x80000000 + + @ Overflow: return INF. +LSYM(Lad_o): + orr r0, r3, #0x7f000000 + orr r0, r0, #0x00800000 + RET + + @ At least one of r0/r1 is INF/NAN. 
+ @ if r0 != INF/NAN: return r1 (which is INF/NAN) + @ if r1 != INF/NAN: return r0 (which is INF/NAN) + @ if r0 or r1 is NAN: return NAN + @ if opposite sign: return NAN + @ otherwise return r0 (which is INF or -INF) +LSYM(Lad_i): + mvns r2, r2, asr #24 + do_it ne, et + movne r0, r1 + COND(mvn,s,eq) r3, r3, asr #24 + movne r1, r0 + movs r2, r0, lsl #9 + do_it eq, te + COND(mov,s,eq) r3, r1, lsl #9 + teqeq r0, r1 + orrne r0, r0, #0x00400000 @ quiet NAN + RET + + FUNC_END aeabi_frsub + FUNC_END aeabi_fadd + FUNC_END addsf3 + FUNC_END aeabi_fsub + FUNC_END subsf3 + +ARM_FUNC_START floatunsisf +ARM_FUNC_ALIAS aeabi_ui2f floatunsisf + + mov r3, #0 @ r3 holds the sign: none for unsigned input + b 1f + +ARM_FUNC_START floatsisf +ARM_FUNC_ALIAS aeabi_i2f floatsisf + + ands r3, r0, #0x80000000 + do_it mi + rsbmi r0, r0, #0 + +1: movs ip, r0 + do_it eq + RETc(eq) + + @ Add initial exponent to sign + orr r3, r3, #((127 + 23) << 23) + + .ifnc ah, r0 + mov ah, r0 + .endif + mov al, #0 + b 2f + + FUNC_END aeabi_i2f + FUNC_END floatsisf + FUNC_END aeabi_ui2f + FUNC_END floatunsisf + +ARM_FUNC_START floatundisf +ARM_FUNC_ALIAS aeabi_ul2f floatundisf + + @ Return early when both input words are zero. + orrs r2, r0, r1 +#if !defined (__VFP_FP__) && !defined(__SOFTFP__) + do_it eq, t + mvfeqs f0, #0.0 +#else + do_it eq +#endif + RETc(eq) + + mov r3, #0 + b 1f + +ARM_FUNC_START floatdisf +ARM_FUNC_ALIAS aeabi_l2f floatdisf + + orrs r2, r0, r1 +#if !defined (__VFP_FP__) && !defined(__SOFTFP__) + do_it eq, t + mvfeqs f0, #0.0 +#else + do_it eq +#endif + RETc(eq) + + ands r3, ah, #0x80000000 @ sign bit in r3 + bpl 1f +#if defined(__thumb2__) + negs al, al + sbc ah, ah, ah, lsl #1 +#else + rsbs al, al, #0 + rsc ah, ah, #0 +#endif +1: +#if !defined (__VFP_FP__) && !defined(__SOFTFP__) + @ For hard FPA code we want to return via the tail below so that + @ we can return the result in f0 as well as in r0 for backwards + @ compatibility. + str lr, [sp, #-8]! 
+ adr lr, LSYM(f0_ret) +#endif + + movs ip, ah + do_it eq, tt + moveq ip, al + moveq ah, al + moveq al, #0 + + @ Add initial exponent to sign + orr r3, r3, #((127 + 23 + 32) << 23) + do_it eq + subeq r3, r3, #(32 << 23) +2: sub r3, r3, #(1 << 23) + +#if __ARM_ARCH__ < 5 + + mov r2, #23 + cmp ip, #(1 << 16) + do_it hs, t + movhs ip, ip, lsr #16 + subhs r2, r2, #16 + cmp ip, #(1 << 8) + do_it hs, t + movhs ip, ip, lsr #8 + subhs r2, r2, #8 + cmp ip, #(1 << 4) + do_it hs, t + movhs ip, ip, lsr #4 + subhs r2, r2, #4 + cmp ip, #(1 << 2) + do_it hs, e + subhs r2, r2, #2 + sublo r2, r2, ip, lsr #1 + subs r2, r2, ip, lsr #3 + +#else + + clz r2, ip + subs r2, r2, #8 + +#endif + + sub r3, r3, r2, lsl #23 + blt 3f + + shiftop add r3 r3 ah lsl r2 ip + shift1 lsl, ip, al, r2 + rsb r2, r2, #32 + cmp ip, #0x80000000 + shiftop adc r0 r3 al lsr r2 r2 + do_it eq + biceq r0, r0, #1 + RET + +3: add r2, r2, #32 + shift1 lsl, ip, ah, r2 + rsb r2, r2, #32 + orrs al, al, ip, lsl #1 + shiftop adc r0 r3 ah lsr r2 r2 + do_it eq + biceq r0, r0, ip, lsr #31 + RET + +#if !defined (__VFP_FP__) && !defined(__SOFTFP__) + +LSYM(f0_ret): + str r0, [sp, #-4]! + ldfs f0, [sp], #4 + RETLDM + +#endif + + FUNC_END floatdisf + FUNC_END aeabi_l2f + FUNC_END floatundisf + FUNC_END aeabi_ul2f + +#endif /* L_arm_addsubsf3 */ + +#ifdef L_arm_muldivsf3 + +ARM_FUNC_START mulsf3 +ARM_FUNC_ALIAS aeabi_fmul mulsf3 + + @ Mask out exponents, trap any zero/denormal/INF/NAN. + mov ip, #0xff + ands r2, ip, r0, lsr #23 + do_it ne, tt + COND(and,s,ne) r3, ip, r1, lsr #23 + teqne r2, ip + teqne r3, ip + beq LSYM(Lml_s) +LSYM(Lml_x): + + @ Add exponents together + add r2, r2, r3 + + @ Determine final sign. + eor ip, r0, r1 + + @ Convert mantissa to unsigned integer. + @ If power of two, branch to a separate path. + @ Make up for final alignment. 
+ movs r0, r0, lsl #9 + do_it ne + COND(mov,s,ne) r1, r1, lsl #9 + beq LSYM(Lml_1) + mov r3, #0x08000000 + orr r0, r3, r0, lsr #5 + orr r1, r3, r1, lsr #5 + +#if __ARM_ARCH__ < 4 + + @ Put sign bit in r3, which will be restored into r0 later. + and r3, ip, #0x80000000 + + @ Well, no way to make it shorter without the umull instruction. + do_push {r3, r4, r5} + mov r4, r0, lsr #16 + mov r5, r1, lsr #16 + bic r0, r0, r4, lsl #16 + bic r1, r1, r5, lsl #16 + mul ip, r4, r5 + mul r3, r0, r1 + mul r0, r5, r0 + mla r0, r4, r1, r0 + adds r3, r3, r0, lsl #16 + adc r1, ip, r0, lsr #16 + do_pop {r0, r4, r5} + +#else + + @ The actual multiplication. + umull r3, r1, r0, r1 + + @ Put final sign in r0. + and r0, ip, #0x80000000 + +#endif + + @ Adjust result upon the MSB position. + cmp r1, #(1 << 23) + do_it cc, tt + movcc r1, r1, lsl #1 + orrcc r1, r1, r3, lsr #31 + movcc r3, r3, lsl #1 + + @ Add sign to result. + orr r0, r0, r1 + + @ Apply exponent bias, check for under/overflow. + sbc r2, r2, #127 + cmp r2, #(254 - 1) + bhi LSYM(Lml_u) + + @ Round the result, merge final exponent. + cmp r3, #0x80000000 + adc r0, r0, r2, lsl #23 + do_it eq + biceq r0, r0, #1 + RET + + @ Multiplication by 0x1p*: let's shortcut a lot of code. +LSYM(Lml_1): + teq r0, #0 + and ip, ip, #0x80000000 + do_it eq + moveq r1, r1, lsl #9 + orr r0, ip, r0, lsr #9 + orr r0, r0, r1, lsr #9 + subs r2, r2, #127 + do_it gt, tt + COND(rsb,s,gt) r3, r2, #255 + orrgt r0, r0, r2, lsl #23 + RETc(gt) + + @ Under/overflow: fix things up for the code below. + orr r0, r0, #0x00800000 + mov r3, #0 + subs r2, r2, #1 + +LSYM(Lml_u): + @ Overflow? + bgt LSYM(Lml_o) + + @ Check if denormalized result is possible, otherwise return signed 0. + cmn r2, #(24 + 1) + do_it le, t + bicle r0, r0, #0x7fffffff + RETc(le) + + @ Shift value right, round, etc. 
+ rsb r2, r2, #0 + movs r1, r0, lsl #1 + shift1 lsr, r1, r1, r2 + rsb r2, r2, #32 + shift1 lsl, ip, r0, r2 + movs r0, r1, rrx + adc r0, r0, #0 + orrs r3, r3, ip, lsl #1 + do_it eq + biceq r0, r0, ip, lsr #31 + RET + + @ One or both arguments are denormalized. + @ Scale them leftwards and preserve sign bit. +LSYM(Lml_d): + teq r2, #0 + and ip, r0, #0x80000000 +1: do_it eq, tt + moveq r0, r0, lsl #1 + tsteq r0, #0x00800000 + subeq r2, r2, #1 + beq 1b + orr r0, r0, ip + teq r3, #0 + and ip, r1, #0x80000000 +2: do_it eq, tt + moveq r1, r1, lsl #1 + tsteq r1, #0x00800000 + subeq r3, r3, #1 + beq 2b + orr r1, r1, ip + b LSYM(Lml_x) + +LSYM(Lml_s): + @ Isolate the INF and NAN cases away + and r3, ip, r1, lsr #23 + teq r2, ip + do_it ne + teqne r3, ip + beq 1f + + @ Here, one or more arguments are either denormalized or zero. + bics ip, r0, #0x80000000 + do_it ne + COND(bic,s,ne) ip, r1, #0x80000000 + bne LSYM(Lml_d) @ neither argument is zero: normalize + + @ Result is 0, but determine sign anyway. +LSYM(Lml_z): + eor r0, r0, r1 + bic r0, r0, #0x7fffffff + RET + +1: @ One or both args are INF or NAN. + teq r0, #0x0 + do_it ne, ett + teqne r0, #0x80000000 + moveq r0, r1 + teqne r1, #0x0 + teqne r1, #0x80000000 + beq LSYM(Lml_n) @ 0 * INF or INF * 0 -> NAN + teq r2, ip + bne 1f + movs r2, r0, lsl #9 + bne LSYM(Lml_n) @ NAN * <anything> -> NAN +1: teq r3, ip + bne LSYM(Lml_i) + movs r3, r1, lsl #9 + do_it ne + movne r0, r1 + bne LSYM(Lml_n) @ <anything> * NAN -> NAN + + @ Result is INF, but we need to determine its sign. +LSYM(Lml_i): + eor r0, r0, r1 + + @ Overflow: return INF (sign already in r0). +LSYM(Lml_o): + and r0, r0, #0x80000000 + orr r0, r0, #0x7f000000 + orr r0, r0, #0x00800000 + RET + + @ Return a quiet NAN. +LSYM(Lml_n): + orr r0, r0, #0x7f000000 + orr r0, r0, #0x00c00000 + RET + + FUNC_END aeabi_fmul + FUNC_END mulsf3 + +ARM_FUNC_START divsf3 +ARM_FUNC_ALIAS aeabi_fdiv divsf3 + + @ Mask out exponents, trap any zero/denormal/INF/NAN. 
+ mov ip, #0xff + ands r2, ip, r0, lsr #23 + do_it ne, tt + COND(and,s,ne) r3, ip, r1, lsr #23 + teqne r2, ip + teqne r3, ip + beq LSYM(Ldv_s) +LSYM(Ldv_x): + + @ Subtract divisor exponent from dividend's + sub r2, r2, r3 + + @ Preserve final sign into ip. + eor ip, r0, r1 + + @ Convert mantissa to unsigned integer. + @ Dividend -> r3, divisor -> r1. + movs r1, r1, lsl #9 + mov r0, r0, lsl #9 + beq LSYM(Ldv_1) + mov r3, #0x10000000 + orr r1, r3, r1, lsr #4 + orr r3, r3, r0, lsr #4 + + @ Initialize r0 (result) with final sign bit. + and r0, ip, #0x80000000 + + @ Ensure result will land to known bit position. + @ Apply exponent bias accordingly. + cmp r3, r1 + do_it cc + movcc r3, r3, lsl #1 + adc r2, r2, #(127 - 2) + + @ The actual division loop. + mov ip, #0x00800000 +1: cmp r3, r1 + do_it cs, t + subcs r3, r3, r1 + orrcs r0, r0, ip + cmp r3, r1, lsr #1 + do_it cs, t + subcs r3, r3, r1, lsr #1 + orrcs r0, r0, ip, lsr #1 + cmp r3, r1, lsr #2 + do_it cs, t + subcs r3, r3, r1, lsr #2 + orrcs r0, r0, ip, lsr #2 + cmp r3, r1, lsr #3 + do_it cs, t + subcs r3, r3, r1, lsr #3 + orrcs r0, r0, ip, lsr #3 + movs r3, r3, lsl #4 + do_it ne + COND(mov,s,ne) ip, ip, lsr #4 + bne 1b + + @ Check exponent for under/overflow. + cmp r2, #(254 - 1) + bhi LSYM(Lml_u) + + @ Round the result, merge final exponent. + cmp r3, r1 + adc r0, r0, r2, lsl #23 + do_it eq + biceq r0, r0, #1 + RET + + @ Division by 0x1p*: let's shortcut a lot of code. +LSYM(Ldv_1): + and ip, ip, #0x80000000 + orr r0, ip, r0, lsr #9 + adds r2, r2, #127 + do_it gt, tt + COND(rsb,s,gt) r3, r2, #255 + orrgt r0, r0, r2, lsl #23 + RETc(gt) + + orr r0, r0, #0x00800000 + mov r3, #0 + subs r2, r2, #1 + b LSYM(Lml_u) + + @ One or both arguments are denormalized. + @ Scale them leftwards and preserve sign bit. 
+LSYM(Ldv_d): + teq r2, #0 + and ip, r0, #0x80000000 +1: do_it eq, tt + moveq r0, r0, lsl #1 + tsteq r0, #0x00800000 + subeq r2, r2, #1 + beq 1b + orr r0, r0, ip + teq r3, #0 + and ip, r1, #0x80000000 +2: do_it eq, tt + moveq r1, r1, lsl #1 + tsteq r1, #0x00800000 + subeq r3, r3, #1 + beq 2b + orr r1, r1, ip + b LSYM(Ldv_x) + + @ One or both arguments are either INF, NAN, zero or denormalized. +LSYM(Ldv_s): + and r3, ip, r1, lsr #23 + teq r2, ip + bne 1f + movs r2, r0, lsl #9 + bne LSYM(Lml_n) @ NAN / <anything> -> NAN + teq r3, ip + bne LSYM(Lml_i) @ INF / <anything> -> INF + mov r0, r1 + b LSYM(Lml_n) @ INF / (INF or NAN) -> NAN +1: teq r3, ip + bne 2f + movs r3, r1, lsl #9 + beq LSYM(Lml_z) @ <anything> / INF -> 0 + mov r0, r1 + b LSYM(Lml_n) @ <anything> / NAN -> NAN +2: @ If both are nonzero, we need to normalize and resume above. + bics ip, r0, #0x80000000 + do_it ne + COND(bic,s,ne) ip, r1, #0x80000000 + bne LSYM(Ldv_d) + @ One or both arguments are zero. + bics r2, r0, #0x80000000 + bne LSYM(Lml_i) @ <non_zero> / 0 -> INF + bics r3, r1, #0x80000000 + bne LSYM(Lml_z) @ 0 / <non_zero> -> 0 + b LSYM(Lml_n) @ 0 / 0 -> NAN + + FUNC_END aeabi_fdiv + FUNC_END divsf3 + +#endif /* L_arm_muldivsf3 */ + +#ifdef L_arm_cmpsf2 + + @ The return value in r0 is + @ + @ 0 if the operands are equal + @ 1 if the first operand is greater than the second, or + @ the operands are unordered and the operation is + @ CMP, LT, LE, NE, or EQ. + @ -1 if the first operand is less than the second, or + @ the operands are unordered and the operation is GT + @ or GE. + @ + @ The Z flag will be set iff the operands are equal. + @ + @ The following registers are clobbered by this function: + @ ip, r0, r1, r2, r3 + +ARM_FUNC_START gtsf2 +ARM_FUNC_ALIAS gesf2 gtsf2 + mov ip, #-1 + b 1f + +ARM_FUNC_START ltsf2 +ARM_FUNC_ALIAS lesf2 ltsf2 + mov ip, #1 + b 1f + +ARM_FUNC_START cmpsf2 +ARM_FUNC_ALIAS nesf2 cmpsf2 +ARM_FUNC_ALIAS eqsf2 cmpsf2 + mov ip, #1 @ how should we specify unordered here? 
+ +1: str ip, [sp, #-4]! @ save the unordered return code + + @ Trap any INF/NAN first. + mov r2, r0, lsl #1 + mov r3, r1, lsl #1 + mvns ip, r2, asr #24 + do_it ne + COND(mvn,s,ne) ip, r3, asr #24 + beq 3f + + @ Compare values. + @ Note that 0.0 is equal to -0.0. +2: add sp, sp, #4 + orrs ip, r2, r3, lsr #1 @ test if both are 0, clear C flag + do_it ne + teqne r0, r1 @ if not 0 compare sign + do_it pl + COND(sub,s,pl) r0, r2, r3 @ if same sign compare values, set r0 + + @ Result: + do_it hi + movhi r0, r1, asr #31 + do_it lo + mvnlo r0, r1, asr #31 + do_it ne + orrne r0, r0, #1 + RET + + @ Look for a NAN. +3: mvns ip, r2, asr #24 + bne 4f + movs ip, r0, lsl #9 + bne 5f @ r0 is NAN +4: mvns ip, r3, asr #24 + bne 2b + movs ip, r1, lsl #9 + beq 2b @ r1 is not NAN +5: ldr r0, [sp], #4 @ return unordered code. + RET + + FUNC_END gesf2 + FUNC_END gtsf2 + FUNC_END lesf2 + FUNC_END ltsf2 + FUNC_END nesf2 + FUNC_END eqsf2 + FUNC_END cmpsf2 + +ARM_FUNC_START aeabi_cfrcmple + + @ Exchange r0 and r1 through ip, then share the code at label 6 below. + mov ip, r0 + mov r0, r1 + mov r1, ip + b 6f + +ARM_FUNC_START aeabi_cfcmpeq +ARM_FUNC_ALIAS aeabi_cfcmple aeabi_cfcmpeq + + @ The status-returning routines are required to preserve all + @ registers except ip, lr, and cpsr. +6: do_push {r0, r1, r2, r3, lr} + ARM_CALL cmpsf2 + @ Set the Z flag correctly, and the C flag unconditionally. + cmp r0, #0 + @ Clear the C flag if the return value was -1, indicating + @ that the first operand was smaller than the second. + do_it mi + cmnmi r0, #0 + RETLDM "r0, r1, r2, r3" + + FUNC_END aeabi_cfcmple + FUNC_END aeabi_cfcmpeq + FUNC_END aeabi_cfrcmple + +ARM_FUNC_START aeabi_fcmpeq + + str lr, [sp, #-8]! + ARM_CALL aeabi_cfcmple + do_it eq, e + moveq r0, #1 @ Equal to. + movne r0, #0 @ Less than, greater than, or unordered. + RETLDM + + FUNC_END aeabi_fcmpeq + +ARM_FUNC_START aeabi_fcmplt + + str lr, [sp, #-8]! + ARM_CALL aeabi_cfcmple + do_it cc, e + movcc r0, #1 @ Less than. + movcs r0, #0 @ Equal to, greater than, or unordered. 
+ RETLDM + + FUNC_END aeabi_fcmplt + +ARM_FUNC_START aeabi_fcmple + + str lr, [sp, #-8]! + ARM_CALL aeabi_cfcmple + do_it ls, e + movls r0, #1 @ Less than or equal to. + movhi r0, #0 @ Greater than or unordered. + RETLDM + + FUNC_END aeabi_fcmple + +ARM_FUNC_START aeabi_fcmpge + + str lr, [sp, #-8]! + ARM_CALL aeabi_cfrcmple + do_it ls, e + movls r0, #1 @ Operand 2 is less than or equal to operand 1. + movhi r0, #0 @ Operand 2 is greater than operand 1, or unordered. + RETLDM + + FUNC_END aeabi_fcmpge + +ARM_FUNC_START aeabi_fcmpgt + + str lr, [sp, #-8]! + ARM_CALL aeabi_cfrcmple + do_it cc, e + movcc r0, #1 @ Operand 2 is less than operand 1. + movcs r0, #0 @ Operand 2 is greater than or equal to operand 1, + @ or they are unordered. + RETLDM + + FUNC_END aeabi_fcmpgt + +#endif /* L_arm_cmpsf2 */ + +#ifdef L_arm_unordsf2 + +ARM_FUNC_START unordsf2 +ARM_FUNC_ALIAS aeabi_fcmpun unordsf2 + + mov r2, r0, lsl #1 + mov r3, r1, lsl #1 + mvns ip, r2, asr #24 + bne 1f + movs ip, r0, lsl #9 + bne 3f @ r0 is NAN +1: mvns ip, r3, asr #24 + bne 2f + movs ip, r1, lsl #9 + bne 3f @ r1 is NAN +2: mov r0, #0 @ arguments are ordered. + RET +3: mov r0, #1 @ arguments are unordered. + RET + + FUNC_END aeabi_fcmpun + FUNC_END unordsf2 + +#endif /* L_arm_unordsf2 */ + +#ifdef L_arm_fixsfsi + +ARM_FUNC_START fixsfsi +ARM_FUNC_ALIAS aeabi_f2iz fixsfsi + + @ check exponent range. + mov r2, r0, lsl #1 + cmp r2, #(127 << 24) + bcc 1f @ value is too small + mov r3, #(127 + 31) + subs r2, r3, r2, lsr #24 + bls 2f @ value is too large + + @ scale value + mov r3, r0, lsl #8 + orr r3, r3, #0x80000000 + tst r0, #0x80000000 @ the sign bit + shift1 lsr, r0, r3, r2 + do_it ne + rsbne r0, r0, #0 + RET + +1: mov r0, #0 + RET + +2: cmp r2, #(127 + 31 - 0xff) + bne 3f + movs r2, r0, lsl #9 + bne 4f @ r0 is NAN. +3: ands r0, r0, #0x80000000 @ the sign bit + do_it eq + moveq r0, #0x7fffffff @ the maximum signed positive si + RET + +4: mov r0, #0 @ What should we convert NAN to? 
+ RET + + FUNC_END aeabi_f2iz + FUNC_END fixsfsi + +#endif /* L_arm_fixsfsi */ + +#ifdef L_arm_fixunssfsi + +ARM_FUNC_START fixunssfsi +ARM_FUNC_ALIAS aeabi_f2uiz fixunssfsi + + @ check exponent range. + movs r2, r0, lsl #1 + bcs 1f @ value is negative + cmp r2, #(127 << 24) + bcc 1f @ value is too small + mov r3, #(127 + 31) + subs r2, r3, r2, lsr #24 + bmi 2f @ value is too large + + @ scale the value + mov r3, r0, lsl #8 + orr r3, r3, #0x80000000 + shift1 lsr, r0, r3, r2 + RET + +1: mov r0, #0 + RET + +2: cmp r2, #(127 + 31 - 0xff) + bne 3f + movs r2, r0, lsl #9 + bne 4f @ r0 is NAN. +3: mov r0, #0xffffffff @ maximum unsigned si + RET + +4: mov r0, #0 @ What should we convert NAN to? + RET + + FUNC_END aeabi_f2uiz + FUNC_END fixunssfsi + +#endif /* L_arm_fixunssfsi */ diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S new file mode 100644 index 00000000000..2e76c01df4b --- /dev/null +++ b/libgcc/config/arm/lib1funcs.S @@ -0,0 +1,1829 @@ +@ libgcc routines for ARM cpu. +@ Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk) + +/* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005, 2007, 2008, + 2009, 2010 Free Software Foundation, Inc. + +This file is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3, or (at your option) any +later version. + +This file is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. 
+ +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + +/* An executable stack is *not* required for these functions. */ +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +.previous +#endif /* __ELF__ and __linux__ */ + +#ifdef __ARM_EABI__ +/* Some attributes that are common to all routines in this file. */ + /* Tag_ABI_align_needed: This code does not require 8-byte + alignment from the caller. */ + /* .eabi_attribute 24, 0 -- default setting. */ + /* Tag_ABI_align_preserved: This code preserves 8-byte + alignment in any callee. */ + .eabi_attribute 25, 1 +#endif /* __ARM_EABI__ */ +/* ------------------------------------------------------------------------ */ + +/* We need to know what prefix to add to function names. */ + +#ifndef __USER_LABEL_PREFIX__ +#error __USER_LABEL_PREFIX__ not defined +#endif + +/* ANSI concatenation macros. */ + +#define CONCAT1(a, b) CONCAT2(a, b) +#define CONCAT2(a, b) a ## b + +/* Use the right prefix for global labels. */ + +#define SYM(x) CONCAT1 (__USER_LABEL_PREFIX__, x) + +#ifdef __ELF__ +#ifdef __thumb__ +#define __PLT__ /* Not supported in Thumb assembler (for now). */ +#elif defined __vxworks && !defined __PIC__ +#define __PLT__ /* Not supported by the kernel loader. */ +#else +#define __PLT__ (PLT) +#endif +/* ELF only: TYPE and SIZE emit .type/.size for SYM(x); LSYM prefixes a + label with a dot. All three are empty or pass-through elsewhere. */ +#define TYPE(x) .type SYM(x),function +#define SIZE(x) .size SYM(x), . - SYM(x) +#define LSYM(x) .x +#else +#define __PLT__ +#define TYPE(x) +#define SIZE(x) +#define LSYM(x) x +#endif + +/* Function end macros. Variants for interworking. 
*/ + +/* Derive a single numeric __ARM_ARCH__ from the compiler's + __ARM_ARCH_*__ predefines. */ +#if defined(__ARM_ARCH_2__) +# define __ARM_ARCH__ 2 +#endif + +#if defined(__ARM_ARCH_3__) +# define __ARM_ARCH__ 3 +#endif + +#if defined(__ARM_ARCH_3M__) || defined(__ARM_ARCH_4__) \ + || defined(__ARM_ARCH_4T__) +/* We use __ARM_ARCH__ set to 4 here, but in reality it's any processor with + long multiply instructions. That includes v3M. */ +# define __ARM_ARCH__ 4 +#endif + +#if defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) \ + || defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \ + || defined(__ARM_ARCH_5TEJ__) +# define __ARM_ARCH__ 5 +#endif + +#if defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ + || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) \ + || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) \ + || defined(__ARM_ARCH_6M__) +# define __ARM_ARCH__ 6 +#endif + +#if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ + || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ + || defined(__ARM_ARCH_7EM__) +# define __ARM_ARCH__ 7 +#endif + +#ifndef __ARM_ARCH__ +#error Unable to determine architecture. +#endif + +/* There are times when we might prefer Thumb1 code even if ARM code is + permitted, for example, the code might be smaller, or there might be + interworking problems with switching to ARM state if interworking is + disabled. */ +#if (defined(__thumb__) \ + && !defined(__thumb2__) \ + && (!defined(__THUMB_INTERWORK__) \ + || defined (__OPTIMIZE_SIZE__) \ + || defined(__ARM_ARCH_6M__))) +# define __prefer_thumb__ +#endif + +/* How to return from a function call depends on the architecture variant. */ + +#if (__ARM_ARCH__ > 4) || defined(__ARM_ARCH_4T__) + +# define RET bx lr +# define RETc(x) bx##x lr + +/* Special precautions for interworking on armv4t. */ +# if (__ARM_ARCH__ == 4) + +/* Always use bx, not ldr pc. */ +# if (defined(__thumb__) || defined(__THUMB_INTERWORK__)) +# define __INTERWORKING__ +# endif /* __THUMB__ || __THUMB_INTERWORK__ */ + +/* Include thumb stub before arm mode code. 
*/ +# if defined(__thumb__) && !defined(__THUMB_INTERWORK__) +# define __INTERWORKING_STUBS__ +# endif /* __thumb__ && !__THUMB_INTERWORK__ */ + +#endif /* __ARM_ARCH__ == 4 */ + +#else + +# define RET mov pc, lr +# define RETc(x) mov##x pc, lr + +#endif + +.macro cfi_pop advance, reg, cfa_offset +#ifdef __ELF__ + .pushsection .debug_frame + .byte 0x4 /* DW_CFA_advance_loc4 */ + .4byte \advance + .byte (0xc0 | \reg) /* DW_CFA_restore */ + .byte 0xe /* DW_CFA_def_cfa_offset */ + .uleb128 \cfa_offset + .popsection +#endif +.endm +.macro cfi_push advance, reg, offset, cfa_offset +#ifdef __ELF__ + .pushsection .debug_frame + .byte 0x4 /* DW_CFA_advance_loc4 */ + .4byte \advance + .byte (0x80 | \reg) /* DW_CFA_offset */ + .uleb128 (\offset / -4) + .byte 0xe /* DW_CFA_def_cfa_offset */ + .uleb128 \cfa_offset + .popsection +#endif +.endm +.macro cfi_start start_label, end_label +#ifdef __ELF__ + .pushsection .debug_frame +LSYM(Lstart_frame): + .4byte LSYM(Lend_cie) - LSYM(Lstart_cie) @ Length of CIE +LSYM(Lstart_cie): + .4byte 0xffffffff @ CIE Identifier Tag + .byte 0x1 @ CIE Version + .ascii "\0" @ CIE Augmentation + .uleb128 0x1 @ CIE Code Alignment Factor + .sleb128 -4 @ CIE Data Alignment Factor + .byte 0xe @ CIE RA Column + .byte 0xc @ DW_CFA_def_cfa + .uleb128 0xd + .uleb128 0x0 + + .align 2 +LSYM(Lend_cie): + .4byte LSYM(Lend_fde)-LSYM(Lstart_fde) @ FDE Length +LSYM(Lstart_fde): + .4byte LSYM(Lstart_frame) @ FDE CIE offset + .4byte \start_label @ FDE initial location + .4byte \end_label-\start_label @ FDE address range + .popsection +#endif +.endm +.macro cfi_end end_label +#ifdef __ELF__ + .pushsection .debug_frame + .align 2 +LSYM(Lend_fde): + .popsection +\end_label: +#endif +.endm + +/* Don't pass dirn, it's there just to get token pasting right. 
*/ + +.macro RETLDM regs=, cond=, unwind=, dirn=ia +#if defined (__INTERWORKING__) + .ifc "\regs","" + ldr\cond lr, [sp], #8 + .else +# if defined(__thumb2__) + pop\cond {\regs, lr} +# else + ldm\cond\dirn sp!, {\regs, lr} +# endif + .endif + .ifnc "\unwind", "" + /* Mark LR as restored. */ +97: cfi_pop 97b - \unwind, 0xe, 0x0 + .endif + bx\cond lr +#else + /* Caller is responsible for providing IT instruction. */ + .ifc "\regs","" + ldr\cond pc, [sp], #8 + .else +# if defined(__thumb2__) + pop\cond {\regs, pc} +# else + ldm\cond\dirn sp!, {\regs, pc} +# endif + .endif +#endif +.endm + +/* The Unified assembly syntax allows the same code to be assembled for both + ARM and Thumb-2. However this is only supported by recent gas, so define + a set of macros to allow ARM code on older assemblers. */ +#if defined(__thumb2__) +.macro do_it cond, suffix="" + it\suffix \cond +.endm +.macro shift1 op, arg0, arg1, arg2 + \op \arg0, \arg1, \arg2 +.endm +#define do_push push +#define do_pop pop +#define COND(op1, op2, cond) op1 ## op2 ## cond +/* Perform an arithmetic operation with a variable shift operand. This + requires two instructions and a scratch register on Thumb-2. */ +.macro shiftop name, dest, src1, src2, shiftop, shiftreg, tmp + \shiftop \tmp, \src2, \shiftreg + \name \dest, \src1, \tmp +.endm +#else +.macro do_it cond, suffix="" +.endm +.macro shift1 op, arg0, arg1, arg2 + mov \arg0, \arg1, \op \arg2 +.endm +#define do_push stmfd sp!, +#define do_pop ldmfd sp!, +#define COND(op1, op2, cond) op1 ## cond ## op2 +.macro shiftop name, dest, src1, src2, shiftop, shiftreg, tmp + \name \dest, \src1, \src2, \shiftop \shiftreg +.endm +#endif + +#ifdef __ARM_EABI__ +.macro ARM_LDIV0 name signed + cmp r0, #0 + .ifc \signed, unsigned + movne r0, #0xffffffff + .else + movgt r0, #0x7fffffff + movlt r0, #0x80000000 + .endif + b SYM (__aeabi_idiv0) __PLT__ +.endm +#else +.macro ARM_LDIV0 name signed + str lr, [sp, #-8]! 
+98: cfi_push 98b - __\name, 0xe, -0x8, 0x8 + bl SYM (__div0) __PLT__ + mov r0, #0 @ About as wrong as it could be. + RETLDM unwind=98b +.endm +#endif + + +#ifdef __ARM_EABI__ +.macro THUMB_LDIV0 name signed +#if defined(__ARM_ARCH_6M__) + .ifc \signed, unsigned + cmp r0, #0 + beq 1f + mov r0, #0 + mvn r0, r0 @ 0xffffffff +1: + .else + cmp r0, #0 + beq 2f + blt 3f + mov r0, #0 + mvn r0, r0 + lsr r0, r0, #1 @ 0x7fffffff + b 2f +3: mov r0, #0x80 + lsl r0, r0, #24 @ 0x80000000 +2: + .endif + push {r0, r1, r2} + ldr r0, 4f + adr r1, 4f + add r0, r1 + str r0, [sp, #8] + @ We know we are not on armv4t, so pop pc is safe. + pop {r0, r1, pc} + .align 2 +4: + .word __aeabi_idiv0 - 4b +#elif defined(__thumb2__) + .syntax unified + .ifc \signed, unsigned + cbz r0, 1f + mov r0, #0xffffffff +1: + .else + cmp r0, #0 + do_it gt + movgt r0, #0x7fffffff + do_it lt + movlt r0, #0x80000000 + .endif + b.w SYM(__aeabi_idiv0) __PLT__ +#else + .align 2 + bx pc + nop + .arm + cmp r0, #0 + .ifc \signed, unsigned + movne r0, #0xffffffff + .else + movgt r0, #0x7fffffff + movlt r0, #0x80000000 + .endif + b SYM(__aeabi_idiv0) __PLT__ + .thumb +#endif +.endm +#else +.macro THUMB_LDIV0 name signed + push { r1, lr } +98: cfi_push 98b - __\name, 0xe, -0x4, 0x8 + bl SYM (__div0) + mov r0, #0 @ About as wrong as it could be. +#if defined (__INTERWORKING__) + pop { r1, r2 } + bx r2 +#else + pop { r1, pc } +#endif +.endm +#endif + +.macro FUNC_END name + SIZE (__\name) +.endm + +.macro DIV_FUNC_END name signed + cfi_start __\name, LSYM(Lend_div0) +LSYM(Ldiv0): +#ifdef __thumb__ + THUMB_LDIV0 \name \signed +#else + ARM_LDIV0 \name \signed +#endif + cfi_end LSYM(Lend_div0) + FUNC_END \name +.endm + +.macro THUMB_FUNC_START name + .globl SYM (\name) + TYPE (\name) + .thumb_func +SYM (\name): +.endm + +/* Function start macros. Variants for ARM and Thumb. 
*/ + +#ifdef __thumb__ +#define THUMB_FUNC .thumb_func +#define THUMB_CODE .force_thumb +# if defined(__thumb2__) +#define THUMB_SYNTAX .syntax divided +# else +#define THUMB_SYNTAX +# endif +#else +#define THUMB_FUNC +#define THUMB_CODE +#define THUMB_SYNTAX +#endif + +.macro FUNC_START name + .text + .globl SYM (__\name) + TYPE (__\name) + .align 0 + THUMB_CODE + THUMB_FUNC + THUMB_SYNTAX +SYM (__\name): +.endm + +/* Special function that will always be coded in ARM assembly, even if + in Thumb-only compilation. */ + +#if defined(__thumb2__) + +/* For Thumb-2 we build everything in thumb mode. */ +.macro ARM_FUNC_START name + FUNC_START \name + .syntax unified +.endm +#define EQUIV .thumb_set +.macro ARM_CALL name + bl __\name +.endm + +#elif defined(__INTERWORKING_STUBS__) + +.macro ARM_FUNC_START name + FUNC_START \name + bx pc + nop + .arm +/* A hook to tell gdb that we've switched to ARM mode. Also used to call + directly from other local arm routines. */ +_L__\name: +.endm +#define EQUIV .thumb_set +/* Branch directly to a function declared with ARM_FUNC_START. + Must be called in arm mode. 
*/ +.macro ARM_CALL name + bl _L__\name +.endm + +#else /* !(__INTERWORKING_STUBS__ || __thumb2__) */ + +#ifdef __ARM_ARCH_6M__ +#define EQUIV .thumb_set +#else +.macro ARM_FUNC_START name + .text + .globl SYM (__\name) + TYPE (__\name) + .align 0 + .arm +SYM (__\name): +.endm +#define EQUIV .set +.macro ARM_CALL name + bl __\name +.endm +#endif + +#endif + +.macro FUNC_ALIAS new old + .globl SYM (__\new) +#if defined (__thumb__) + .thumb_set SYM (__\new), SYM (__\old) +#else + .set SYM (__\new), SYM (__\old) +#endif +.endm + +#ifndef __ARM_ARCH_6M__ +.macro ARM_FUNC_ALIAS new old + .globl SYM (__\new) + EQUIV SYM (__\new), SYM (__\old) +#if defined(__INTERWORKING_STUBS__) + .set SYM (_L__\new), SYM (_L__\old) +#endif +.endm +#endif + +#ifdef __ARMEB__ +#define xxh r0 +#define xxl r1 +#define yyh r2 +#define yyl r3 +#else +#define xxh r1 +#define xxl r0 +#define yyh r3 +#define yyl r2 +#endif + +#ifdef __ARM_EABI__ +.macro WEAK name + .weak SYM (__\name) +.endm +#endif + +#ifdef __thumb__ +/* Register aliases. */ + +work .req r4 @ XXXX is this safe ? +dividend .req r0 +divisor .req r1 +overdone .req r2 +result .req r2 +curbit .req r3 +#endif +#if 0 +ip .req r12 +sp .req r13 +lr .req r14 +pc .req r15 +#endif + +/* ------------------------------------------------------------------------ */ +/* Bodies of the division and modulo routines. */ +/* ------------------------------------------------------------------------ */ +.macro ARM_DIV_BODY dividend, divisor, result, curbit + +#if __ARM_ARCH__ >= 5 && ! 
defined (__OPTIMIZE_SIZE__) + +#if defined (__thumb2__) + clz \curbit, \dividend + clz \result, \divisor + sub \curbit, \result, \curbit + rsb \curbit, \curbit, #31 + adr \result, 1f + add \curbit, \result, \curbit, lsl #4 + mov \result, #0 + mov pc, \curbit +.p2align 3 +1: + .set shift, 32 + .rept 32 + .set shift, shift - 1 + cmp.w \dividend, \divisor, lsl #shift + nop.n + adc.w \result, \result, \result + it cs + subcs.w \dividend, \dividend, \divisor, lsl #shift + .endr +#else + clz \curbit, \dividend + clz \result, \divisor + sub \curbit, \result, \curbit + rsbs \curbit, \curbit, #31 + addne \curbit, \curbit, \curbit, lsl #1 + mov \result, #0 + addne pc, pc, \curbit, lsl #2 + nop + .set shift, 32 + .rept 32 + .set shift, shift - 1 + cmp \dividend, \divisor, lsl #shift + adc \result, \result, \result + subcs \dividend, \dividend, \divisor, lsl #shift + .endr +#endif + +#else /* __ARM_ARCH__ < 5 || defined (__OPTIMIZE_SIZE__) */ +#if __ARM_ARCH__ >= 5 + + clz \curbit, \divisor + clz \result, \dividend + sub \result, \curbit, \result + mov \curbit, #1 + mov \divisor, \divisor, lsl \result + mov \curbit, \curbit, lsl \result + mov \result, #0 + +#else /* __ARM_ARCH__ < 5 */ + + @ Initially shift the divisor left 3 bits if possible, + @ set curbit accordingly. This allows for curbit to be located + @ at the left end of each 4-bit nibbles in the division loop + @ to save one loop in most cases. + tst \divisor, #0xe0000000 + moveq \divisor, \divisor, lsl #3 + moveq \curbit, #8 + movne \curbit, #1 + + @ Unless the divisor is very big, shift it up in multiples of + @ four bits, since this is the amount of unwinding in the main + @ division loop. Continue shifting until the divisor is + @ larger than the dividend. +1: cmp \divisor, #0x10000000 + cmplo \divisor, \dividend + movlo \divisor, \divisor, lsl #4 + movlo \curbit, \curbit, lsl #4 + blo 1b + + @ For very big divisors, we must shift it a bit at a time, or + @ we will be in danger of overflowing. 
+1: cmp \divisor, #0x80000000 + cmplo \divisor, \dividend + movlo \divisor, \divisor, lsl #1 + movlo \curbit, \curbit, lsl #1 + blo 1b + + mov \result, #0 + +#endif /* __ARM_ARCH__ < 5 */ + + @ Division loop +1: cmp \dividend, \divisor + do_it hs, t + subhs \dividend, \dividend, \divisor + orrhs \result, \result, \curbit + cmp \dividend, \divisor, lsr #1 + do_it hs, t + subhs \dividend, \dividend, \divisor, lsr #1 + orrhs \result, \result, \curbit, lsr #1 + cmp \dividend, \divisor, lsr #2 + do_it hs, t + subhs \dividend, \dividend, \divisor, lsr #2 + orrhs \result, \result, \curbit, lsr #2 + cmp \dividend, \divisor, lsr #3 + do_it hs, t + subhs \dividend, \dividend, \divisor, lsr #3 + orrhs \result, \result, \curbit, lsr #3 + cmp \dividend, #0 @ Early termination? + do_it ne, t + movnes \curbit, \curbit, lsr #4 @ No, any more bits to do? + movne \divisor, \divisor, lsr #4 + bne 1b + +#endif /* __ARM_ARCH__ < 5 || defined (__OPTIMIZE_SIZE__) */ + +.endm +/* ------------------------------------------------------------------------ */ +.macro ARM_DIV2_ORDER divisor, order + +#if __ARM_ARCH__ >= 5 + + clz \order, \divisor + rsb \order, \order, #31 + +#else + + cmp \divisor, #(1 << 16) + movhs \divisor, \divisor, lsr #16 + movhs \order, #16 + movlo \order, #0 + + cmp \divisor, #(1 << 8) + movhs \divisor, \divisor, lsr #8 + addhs \order, \order, #8 + + cmp \divisor, #(1 << 4) + movhs \divisor, \divisor, lsr #4 + addhs \order, \order, #4 + + cmp \divisor, #(1 << 2) + addhi \order, \order, #3 + addls \order, \order, \divisor, lsr #1 + +#endif + +.endm +/* ------------------------------------------------------------------------ */ +.macro ARM_MOD_BODY dividend, divisor, order, spare + +#if __ARM_ARCH__ >= 5 && ! 
defined (__OPTIMIZE_SIZE__)
+
+    @ Fully unrolled variant. order = 31 - (clz(divisor) - clz(dividend)).
+    @ When order is non-zero the computed jump skips that many of the 32
+    @ compare/subtract pairs below (lsl #3 = 8 bytes per pair, assuming
+    @ 4-byte ARM encodings).
+    clz \order, \divisor
+    clz \spare, \dividend
+    sub \order, \order, \spare
+    rsbs \order, \order, #31
+    addne pc, pc, \order, lsl #3
+    nop
+    .set shift, 32
+    .rept 32
+    .set shift, shift - 1
+    cmp \dividend, \divisor, lsl #shift
+    subcs \dividend, \dividend, \divisor, lsl #shift
+    .endr
+
+#else /* __ARM_ARCH__ < 5 || defined (__OPTIMIZE_SIZE__) */
+#if __ARM_ARCH__ >= 5
+
+    @ order = clz(divisor) - clz(dividend); shift the divisor left by
+    @ that amount.
+    clz \order, \divisor
+    clz \spare, \dividend
+    sub \order, \order, \spare
+    mov \divisor, \divisor, lsl \order
+
+#else /* __ARM_ARCH__ < 5 */
+
+    mov \order, #0
+
+    @ Unless the divisor is very big, shift it up in multiples of
+    @ four bits, since this is the amount of unwinding in the main
+    @ division loop. Continue shifting until the divisor is
+    @ larger than the dividend.
+1:  cmp \divisor, #0x10000000
+    cmplo \divisor, \dividend
+    movlo \divisor, \divisor, lsl #4
+    addlo \order, \order, #4
+    blo 1b
+
+    @ For very big divisors, we must shift it a bit at a time, or
+    @ we will be in danger of overflowing.
+1:  cmp \divisor, #0x80000000
+    cmplo \divisor, \dividend
+    movlo \divisor, \divisor, lsl #1
+    addlo \order, \order, #1
+    blo 1b
+
+#endif /* __ARM_ARCH__ < 5 */
+
+    @ Perform all needed subtractions to keep only the remainder.
+    @ Do comparisons in batches of 4 first.
+    subs \order, \order, #3  @ yes, 3 is intended here
+    blt 2f
+
+1:  cmp \dividend, \divisor
+    subhs \dividend, \dividend, \divisor
+    cmp \dividend, \divisor, lsr #1
+    subhs \dividend, \dividend, \divisor, lsr #1
+    cmp \dividend, \divisor, lsr #2
+    subhs \dividend, \dividend, \divisor, lsr #2
+    cmp \dividend, \divisor, lsr #3
+    subhs \dividend, \dividend, \divisor, lsr #3
+    cmp \dividend, #1
+    mov \divisor, \divisor, lsr #4
+    subges \order, \order, #4
+    bge 1b
+
+    tst \order, #3
+    teqne \dividend, #0
+    beq 5f
+
+    @ Either 1, 2 or 3 comparisons/subtractions are left. 
+2: cmn \order, #2 + blt 4f + beq 3f + cmp \dividend, \divisor + subhs \dividend, \dividend, \divisor + mov \divisor, \divisor, lsr #1 +3: cmp \dividend, \divisor + subhs \dividend, \dividend, \divisor + mov \divisor, \divisor, lsr #1 +4: cmp \dividend, \divisor + subhs \dividend, \dividend, \divisor +5: + +#endif /* __ARM_ARCH__ < 5 || defined (__OPTIMIZE_SIZE__) */ + +.endm +/* ------------------------------------------------------------------------ */ +.macro THUMB_DIV_MOD_BODY modulo + @ Load the constant 0x10000000 into our work register. + mov work, #1 + lsl work, #28 +LSYM(Loop1): + @ Unless the divisor is very big, shift it up in multiples of + @ four bits, since this is the amount of unwinding in the main + @ division loop. Continue shifting until the divisor is + @ larger than the dividend. + cmp divisor, work + bhs LSYM(Lbignum) + cmp divisor, dividend + bhs LSYM(Lbignum) + lsl divisor, #4 + lsl curbit, #4 + b LSYM(Loop1) +LSYM(Lbignum): + @ Set work to 0x80000000 + lsl work, #3 +LSYM(Loop2): + @ For very big divisors, we must shift it a bit at a time, or + @ we will be in danger of overflowing. + cmp divisor, work + bhs LSYM(Loop3) + cmp divisor, dividend + bhs LSYM(Loop3) + lsl divisor, #1 + lsl curbit, #1 + b LSYM(Loop2) +LSYM(Loop3): + @ Test for possible subtractions ... + .if \modulo + @ ... On the final pass, this may subtract too much from the dividend, + @ so keep track of which subtractions are done, we can fix them up + @ afterwards. 
+ mov overdone, #0 + cmp dividend, divisor + blo LSYM(Lover1) + sub dividend, dividend, divisor +LSYM(Lover1): + lsr work, divisor, #1 + cmp dividend, work + blo LSYM(Lover2) + sub dividend, dividend, work + mov ip, curbit + mov work, #1 + ror curbit, work + orr overdone, curbit + mov curbit, ip +LSYM(Lover2): + lsr work, divisor, #2 + cmp dividend, work + blo LSYM(Lover3) + sub dividend, dividend, work + mov ip, curbit + mov work, #2 + ror curbit, work + orr overdone, curbit + mov curbit, ip +LSYM(Lover3): + lsr work, divisor, #3 + cmp dividend, work + blo LSYM(Lover4) + sub dividend, dividend, work + mov ip, curbit + mov work, #3 + ror curbit, work + orr overdone, curbit + mov curbit, ip +LSYM(Lover4): + mov ip, curbit + .else + @ ... and note which bits are done in the result. On the final pass, + @ this may subtract too much from the dividend, but the result will be ok, + @ since the "bit" will have been shifted out at the bottom. + cmp dividend, divisor + blo LSYM(Lover1) + sub dividend, dividend, divisor + orr result, result, curbit +LSYM(Lover1): + lsr work, divisor, #1 + cmp dividend, work + blo LSYM(Lover2) + sub dividend, dividend, work + lsr work, curbit, #1 + orr result, work +LSYM(Lover2): + lsr work, divisor, #2 + cmp dividend, work + blo LSYM(Lover3) + sub dividend, dividend, work + lsr work, curbit, #2 + orr result, work +LSYM(Lover3): + lsr work, divisor, #3 + cmp dividend, work + blo LSYM(Lover4) + sub dividend, dividend, work + lsr work, curbit, #3 + orr result, work +LSYM(Lover4): + .endif + + cmp dividend, #0 @ Early termination? + beq LSYM(Lover5) + lsr curbit, #4 @ No, any more bits to do? + beq LSYM(Lover5) + lsr divisor, #4 + b LSYM(Loop3) +LSYM(Lover5): + .if \modulo + @ Any subtractions that we should not have done will be recorded in + @ the top three bits of "overdone". Exactly which were not needed + @ are governed by the position of the bit, stored in ip. 
+    mov work, #0xe
+    lsl work, #28
+    and overdone, work  @ keep only the top three bits (mask 0xe0000000)
+    beq LSYM(Lgot_result)
+
+    @ If we terminated early, because dividend became zero, then the
+    @ bit in ip will not be in the bottom nibble, and we should not
+    @ perform the additions below. We must test for this though
+    @ (rather than relying upon the TSTs to prevent the additions) since
+    @ the bit in ip could be in the top two bits which might then match
+    @ with one of the smaller RORs.
+    mov curbit, ip
+    mov work, #0x7
+    tst curbit, work
+    beq LSYM(Lgot_result)
+
+    @ Undo the divisor >> 3 subtraction if its bit is set in overdone.
+    mov curbit, ip
+    mov work, #3
+    ror curbit, work
+    tst overdone, curbit
+    beq LSYM(Lover6)
+    lsr work, divisor, #3
+    add dividend, work
+LSYM(Lover6):
+    @ Likewise for the divisor >> 2 subtraction.
+    mov curbit, ip
+    mov work, #2
+    ror curbit, work
+    tst overdone, curbit
+    beq LSYM(Lover7)
+    lsr work, divisor, #2
+    add dividend, work
+LSYM(Lover7):
+    @ Likewise for the divisor >> 1 subtraction.
+    mov curbit, ip
+    mov work, #1
+    ror curbit, work
+    tst overdone, curbit
+    beq LSYM(Lgot_result)
+    lsr work, divisor, #1
+    add dividend, work
+    .endif
+LSYM(Lgot_result):
+.endm
+/* ------------------------------------------------------------------------ */
+/* Start of the Real Functions */
+/* ------------------------------------------------------------------------ */
+#ifdef L_udivsi3
+
+#if defined(__prefer_thumb__)
+
+    FUNC_START udivsi3
+    FUNC_ALIAS aeabi_uidiv udivsi3
+
+    cmp divisor, #0
+    beq LSYM(Ldiv0)
+    @ Secondary entry point: aeabi_uidivmod branches here after doing its
+    @ own zero-divisor check.
+LSYM(udivsi3_skip_div0_test):
+    mov curbit, #1
+    mov result, #0
+
+    push { work }
+    cmp dividend, divisor
+    blo LSYM(Lgot_result)  @ dividend < divisor: result stays 0
+
+    THUMB_DIV_MOD_BODY 0
+
+    mov r0, result
+    pop { work }
+    RET
+
+#else /* ARM version/Thumb-2. */
+
+    ARM_FUNC_START udivsi3
+    ARM_FUNC_ALIAS aeabi_uidiv udivsi3
+
+    /* Note: if called via udivsi3_skip_div0_test, this will unnecessarily
+    check for division-by-zero a second time. 
*/ +LSYM(udivsi3_skip_div0_test): + subs r2, r1, #1 + do_it eq + RETc(eq) + bcc LSYM(Ldiv0) + cmp r0, r1 + bls 11f + tst r1, r2 + beq 12f + + ARM_DIV_BODY r0, r1, r2, r3 + + mov r0, r2 + RET + +11: do_it eq, e + moveq r0, #1 + movne r0, #0 + RET + +12: ARM_DIV2_ORDER r1, r2 + + mov r0, r0, lsr r2 + RET + +#endif /* ARM version */ + + DIV_FUNC_END udivsi3 unsigned + +#if defined(__prefer_thumb__) +FUNC_START aeabi_uidivmod + cmp r1, #0 + beq LSYM(Ldiv0) + push {r0, r1, lr} + bl LSYM(udivsi3_skip_div0_test) + POP {r1, r2, r3} + mul r2, r0 + sub r1, r1, r2 + bx r3 +#else +ARM_FUNC_START aeabi_uidivmod + cmp r1, #0 + beq LSYM(Ldiv0) + stmfd sp!, { r0, r1, lr } + bl LSYM(udivsi3_skip_div0_test) + ldmfd sp!, { r1, r2, lr } + mul r3, r2, r0 + sub r1, r1, r3 + RET +#endif + FUNC_END aeabi_uidivmod + +#endif /* L_udivsi3 */ +/* ------------------------------------------------------------------------ */ +#ifdef L_umodsi3 + + FUNC_START umodsi3 + +#ifdef __thumb__ + + cmp divisor, #0 + beq LSYM(Ldiv0) + mov curbit, #1 + cmp dividend, divisor + bhs LSYM(Lover10) + RET + +LSYM(Lover10): + push { work } + + THUMB_DIV_MOD_BODY 1 + + pop { work } + RET + +#else /* ARM version. */ + + subs r2, r1, #1 @ compare divisor with 1 + bcc LSYM(Ldiv0) + cmpne r0, r1 @ compare dividend with divisor + moveq r0, #0 + tsthi r1, r2 @ see if divisor is power of 2 + andeq r0, r0, r2 + RETc(ls) + + ARM_MOD_BODY r0, r1, r2, r3 + + RET + +#endif /* ARM version. */ + + DIV_FUNC_END umodsi3 unsigned + +#endif /* L_umodsi3 */ +/* ------------------------------------------------------------------------ */ +#ifdef L_divsi3 + +#if defined(__prefer_thumb__) + + FUNC_START divsi3 + FUNC_ALIAS aeabi_idiv divsi3 + + cmp divisor, #0 + beq LSYM(Ldiv0) +LSYM(divsi3_skip_div0_test): + push { work } + mov work, dividend + eor work, divisor @ Save the sign of the result. + mov ip, work + mov curbit, #1 + mov result, #0 + cmp divisor, #0 + bpl LSYM(Lover10) + neg divisor, divisor @ Loops below use unsigned. 
+LSYM(Lover10): + cmp dividend, #0 + bpl LSYM(Lover11) + neg dividend, dividend +LSYM(Lover11): + cmp dividend, divisor + blo LSYM(Lgot_result) + + THUMB_DIV_MOD_BODY 0 + + mov r0, result + mov work, ip + cmp work, #0 + bpl LSYM(Lover12) + neg r0, r0 +LSYM(Lover12): + pop { work } + RET + +#else /* ARM/Thumb-2 version. */ + + ARM_FUNC_START divsi3 + ARM_FUNC_ALIAS aeabi_idiv divsi3 + + cmp r1, #0 + beq LSYM(Ldiv0) +LSYM(divsi3_skip_div0_test): + eor ip, r0, r1 @ save the sign of the result. + do_it mi + rsbmi r1, r1, #0 @ loops below use unsigned. + subs r2, r1, #1 @ division by 1 or -1 ? + beq 10f + movs r3, r0 + do_it mi + rsbmi r3, r0, #0 @ positive dividend value + cmp r3, r1 + bls 11f + tst r1, r2 @ divisor is power of 2 ? + beq 12f + + ARM_DIV_BODY r3, r1, r0, r2 + + cmp ip, #0 + do_it mi + rsbmi r0, r0, #0 + RET + +10: teq ip, r0 @ same sign ? + do_it mi + rsbmi r0, r0, #0 + RET + +11: do_it lo + movlo r0, #0 + do_it eq,t + moveq r0, ip, asr #31 + orreq r0, r0, #1 + RET + +12: ARM_DIV2_ORDER r1, r2 + + cmp ip, #0 + mov r0, r3, lsr r2 + do_it mi + rsbmi r0, r0, #0 + RET + +#endif /* ARM version */ + + DIV_FUNC_END divsi3 signed + +#if defined(__prefer_thumb__) +FUNC_START aeabi_idivmod + cmp r1, #0 + beq LSYM(Ldiv0) + push {r0, r1, lr} + bl LSYM(divsi3_skip_div0_test) + POP {r1, r2, r3} + mul r2, r0 + sub r1, r1, r2 + bx r3 +#else +ARM_FUNC_START aeabi_idivmod + cmp r1, #0 + beq LSYM(Ldiv0) + stmfd sp!, { r0, r1, lr } + bl LSYM(divsi3_skip_div0_test) + ldmfd sp!, { r1, r2, lr } + mul r3, r2, r0 + sub r1, r1, r3 + RET +#endif + FUNC_END aeabi_idivmod + +#endif /* L_divsi3 */ +/* ------------------------------------------------------------------------ */ +#ifdef L_modsi3 + + FUNC_START modsi3 + +#ifdef __thumb__ + + mov curbit, #1 + cmp divisor, #0 + beq LSYM(Ldiv0) + bpl LSYM(Lover10) + neg divisor, divisor @ Loops below use unsigned. +LSYM(Lover10): + push { work } + @ Need to save the sign of the dividend, unfortunately, we need + @ work later on. 
Must do this after saving the original value of
+    @ the work register, because we will pop this value off first.
+    push { dividend }
+    cmp dividend, #0
+    bpl LSYM(Lover11)
+    neg dividend, dividend
+LSYM(Lover11):
+    cmp dividend, divisor
+    blo LSYM(Lgot_result)
+
+    THUMB_DIV_MOD_BODY 1
+
+    @ Pop the saved original dividend; negate the result if it was negative.
+    pop { work }
+    cmp work, #0
+    bpl LSYM(Lover12)
+    neg dividend, dividend
+LSYM(Lover12):
+    pop { work }
+    RET
+
+#else /* ARM version. */
+
+    cmp r1, #0
+    beq LSYM(Ldiv0)
+    rsbmi r1, r1, #0  @ loops below use unsigned.
+    movs ip, r0  @ preserve sign of dividend
+    rsbmi r0, r0, #0  @ if negative make positive
+    subs r2, r1, #1  @ compare divisor with 1
+    cmpne r0, r1  @ compare dividend with divisor
+    moveq r0, #0
+    tsthi r1, r2  @ see if divisor is power of 2
+    andeq r0, r0, r2
+    bls 10f
+
+    ARM_MOD_BODY r0, r1, r2, r3
+
+10: cmp ip, #0  @ give the result the sign of the original dividend
+    rsbmi r0, r0, #0
+    RET
+
+#endif /* ARM version */
+
+    DIV_FUNC_END modsi3 signed
+
+#endif /* L_modsi3 */
+/* ------------------------------------------------------------------------ */
+#ifdef L_dvmd_tls
+
+/* Division-by-zero handlers that simply return. The EABI build declares
+   both entry points weak and places them at the same address. */
+#ifdef __ARM_EABI__
+    WEAK aeabi_idiv0
+    WEAK aeabi_ldiv0
+    FUNC_START aeabi_idiv0
+    FUNC_START aeabi_ldiv0
+    RET
+    FUNC_END aeabi_ldiv0
+    FUNC_END aeabi_idiv0
+#else
+    FUNC_START div0
+    RET
+    FUNC_END div0
+#endif
+
+#endif /* L_dvmd_tls */
+/* ------------------------------------------------------------------------ */
+#ifdef L_dvmd_lnx
+@ GNU/Linux division-by-zero handler. Used in place of L_dvmd_tls
+
+/* Constant taken from <asm/signal.h>. */
+#define SIGFPE 8
+
+#ifdef __ARM_EABI__
+    WEAK aeabi_idiv0
+    WEAK aeabi_ldiv0
+    ARM_FUNC_START aeabi_idiv0
+    ARM_FUNC_START aeabi_ldiv0
+#else
+    ARM_FUNC_START div0
+#endif
+
+    @ Save r1 and lr, call raise with r0 = SIGFPE, then restore r1 and return.
+    do_push {r1, lr}
+    mov r0, #SIGFPE
+    bl SYM(raise) __PLT__
+    RETLDM r1
+
+#ifdef __ARM_EABI__
+    FUNC_END aeabi_ldiv0
+    FUNC_END aeabi_idiv0
+#else
+    FUNC_END div0
+#endif
+
+#endif /* L_dvmd_lnx */
+#ifdef L_clear_cache
+#if defined __ARM_EABI__ && defined __linux__
+@ EABI GNU/Linux call to cacheflush syscall. 
+ ARM_FUNC_START clear_cache + do_push {r7} +#if __ARM_ARCH__ >= 7 || defined(__ARM_ARCH_6T2__) + movw r7, #2 + movt r7, #0xf +#else + mov r7, #0xf0000 + add r7, r7, #2 +#endif + mov r2, #0 + swi 0 + do_pop {r7} + RET + FUNC_END clear_cache +#else +#error "This is only for ARM EABI GNU/Linux" +#endif +#endif /* L_clear_cache */ +/* ------------------------------------------------------------------------ */ +/* Dword shift operations. */ +/* All the following Dword shift variants rely on the fact that + shft xxx, Reg + is in fact done as + shft xxx, (Reg & 255) + so for Reg value in (32...63) and (-1...-31) we will get zero (in the + case of logical shifts) or the sign (for asr). */ + +#ifdef __ARMEB__ +#define al r1 +#define ah r0 +#else +#define al r0 +#define ah r1 +#endif + +/* Prevent __aeabi double-word shifts from being produced on SymbianOS. */ +#ifndef __symbian__ + +#ifdef L_lshrdi3 + + FUNC_START lshrdi3 + FUNC_ALIAS aeabi_llsr lshrdi3 + +#ifdef __thumb__ + lsr al, r2 + mov r3, ah + lsr ah, r2 + mov ip, r3 + sub r2, #32 + lsr r3, r2 + orr al, r3 + neg r2, r2 + mov r3, ip + lsl r3, r2 + orr al, r3 + RET +#else + subs r3, r2, #32 + rsb ip, r2, #32 + movmi al, al, lsr r2 + movpl al, ah, lsr r3 + orrmi al, al, ah, lsl ip + mov ah, ah, lsr r2 + RET +#endif + FUNC_END aeabi_llsr + FUNC_END lshrdi3 + +#endif + +#ifdef L_ashrdi3 + + FUNC_START ashrdi3 + FUNC_ALIAS aeabi_lasr ashrdi3 + +#ifdef __thumb__ + lsr al, r2 + mov r3, ah + asr ah, r2 + sub r2, #32 + @ If r2 is negative at this point the following step would OR + @ the sign bit into all of AL. That's not what we want... 
+ bmi 1f + mov ip, r3 + asr r3, r2 + orr al, r3 + mov r3, ip +1: + neg r2, r2 + lsl r3, r2 + orr al, r3 + RET +#else + subs r3, r2, #32 + rsb ip, r2, #32 + movmi al, al, lsr r2 + movpl al, ah, asr r3 + orrmi al, al, ah, lsl ip + mov ah, ah, asr r2 + RET +#endif + + FUNC_END aeabi_lasr + FUNC_END ashrdi3 + +#endif + +#ifdef L_ashldi3 + + FUNC_START ashldi3 + FUNC_ALIAS aeabi_llsl ashldi3 + +#ifdef __thumb__ + lsl ah, r2 + mov r3, al + lsl al, r2 + mov ip, r3 + sub r2, #32 + lsl r3, r2 + orr ah, r3 + neg r2, r2 + mov r3, ip + lsr r3, r2 + orr ah, r3 + RET +#else + subs r3, r2, #32 + rsb ip, r2, #32 + movmi ah, ah, lsl r2 + movpl ah, al, lsl r3 + orrmi ah, ah, al, lsr ip + mov al, al, lsl r2 + RET +#endif + FUNC_END aeabi_llsl + FUNC_END ashldi3 + +#endif + +#endif /* __symbian__ */ + +#if ((__ARM_ARCH__ > 5) && !defined(__ARM_ARCH_6M__)) \ + || defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \ + || defined(__ARM_ARCH_5TEJ__) +#define HAVE_ARM_CLZ 1 +#endif + +#ifdef L_clzsi2 +#if defined(__ARM_ARCH_6M__) +FUNC_START clzsi2 + mov r1, #28 + mov r3, #1 + lsl r3, r3, #16 + cmp r0, r3 /* 0x10000 */ + bcc 2f + lsr r0, r0, #16 + sub r1, r1, #16 +2: lsr r3, r3, #8 + cmp r0, r3 /* #0x100 */ + bcc 2f + lsr r0, r0, #8 + sub r1, r1, #8 +2: lsr r3, r3, #4 + cmp r0, r3 /* #0x10 */ + bcc 2f + lsr r0, r0, #4 + sub r1, r1, #4 +2: adr r2, 1f + ldrb r0, [r2, r0] + add r0, r0, r1 + bx lr +.align 2 +1: +.byte 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 + FUNC_END clzsi2 +#else +ARM_FUNC_START clzsi2 +# if defined(HAVE_ARM_CLZ) + clz r0, r0 + RET +# else + mov r1, #28 + cmp r0, #0x10000 + do_it cs, t + movcs r0, r0, lsr #16 + subcs r1, r1, #16 + cmp r0, #0x100 + do_it cs, t + movcs r0, r0, lsr #8 + subcs r1, r1, #8 + cmp r0, #0x10 + do_it cs, t + movcs r0, r0, lsr #4 + subcs r1, r1, #4 + adr r2, 1f + ldrb r0, [r2, r0] + add r0, r0, r1 + RET +.align 2 +1: +.byte 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 +# endif /* !HAVE_ARM_CLZ */ + FUNC_END clzsi2 +#endif +#endif /* 
L_clzsi2 */ + +#ifdef L_clzdi2 +#if !defined(HAVE_ARM_CLZ) + +# if defined(__ARM_ARCH_6M__) +FUNC_START clzdi2 + push {r4, lr} +# else +ARM_FUNC_START clzdi2 + do_push {r4, lr} +# endif + cmp xxh, #0 + bne 1f +# ifdef __ARMEB__ + mov r0, xxl + bl __clzsi2 + add r0, r0, #32 + b 2f +1: + bl __clzsi2 +# else + bl __clzsi2 + add r0, r0, #32 + b 2f +1: + mov r0, xxh + bl __clzsi2 +# endif +2: +# if defined(__ARM_ARCH_6M__) + pop {r4, pc} +# else + RETLDM r4 +# endif + FUNC_END clzdi2 + +#else /* HAVE_ARM_CLZ */ + +ARM_FUNC_START clzdi2 + cmp xxh, #0 + do_it eq, et + clzeq r0, xxl + clzne r0, xxh + addeq r0, r0, #32 + RET + FUNC_END clzdi2 + +#endif +#endif /* L_clzdi2 */ + +/* ------------------------------------------------------------------------ */ +/* These next two sections are here despite the fact that they contain Thumb + assembler because their presence allows interworked code to be linked even + when the GCC library is this one. */ + +/* Do not build the interworking functions when the target architecture does + not support Thumb instructions. (This can be a multilib option). */ +#if defined __ARM_ARCH_4T__ || defined __ARM_ARCH_5T__\ + || defined __ARM_ARCH_5TE__ || defined __ARM_ARCH_5TEJ__ \ + || __ARM_ARCH__ >= 6 + +#if defined L_call_via_rX + +/* These labels & instructions are used by the Arm/Thumb interworking code. + The address of function to be called is loaded into a register and then + one of these labels is called via a BL instruction. This puts the + return address into the link register with the bottom bit set, and the + code here switches to the correct mode before executing the function. 
*/
+
+    .text
+    .align 0
+    .force_thumb
+
+/* call_via REGISTER
+   Define the Thumb function _call_via_REGISTER, which branches with bx to
+   the address held in REGISTER. */
+.macro call_via register
+    THUMB_FUNC_START _call_via_\register
+
+    bx \register
+    nop
+
+    SIZE (_call_via_\register)
+.endm
+
+    call_via r0
+    call_via r1
+    call_via r2
+    call_via r3
+    call_via r4
+    call_via r5
+    call_via r6
+    call_via r7
+    call_via r8
+    call_via r9
+    call_via sl
+    call_via fp
+    call_via ip
+    call_via sp
+    call_via lr
+
+#endif /* L_call_via_rX */
+
+/* Don't bother with the old interworking routines for Thumb-2. */
+/* ??? Maybe only omit these on "m" variants. */
+#if !defined(__thumb2__) && !defined(__ARM_ARCH_6M__)
+
+#if defined L_interwork_call_via_rX
+
+/* These labels & instructions are used by the Arm/Thumb interworking code,
+   when the target address is in an unknown instruction set. The address
+   of the function to be called is loaded into a register and then one of
+   these labels is called via a BL instruction. This puts the return address
+   into the link register with the bottom bit set, and the code here
+   switches to the correct mode before executing the function. Unfortunately
+   the target code cannot be relied upon to return via a BX instruction, so
+   instead we have to store the return address on the stack and allow the
+   called function to return here instead. Upon return we recover the real
+   return address and use a BX to get back to Thumb mode.
+
+   There are three variations of this code. The first,
+   _interwork_call_via_rN(), will push the return address onto the
+   stack and pop it in _arm_return(). It should only be used if all
+   arguments are passed in registers.
+
+   The second, _interwork_r7_call_via_rN(), instead stores the return
+   address at [r7, #-4]. It is the caller's responsibility to ensure
+   that this address is valid and contains no useful data.
+
+   The third, _interwork_r11_call_via_rN(), works in the same way but
+   uses r11 instead of r7. It is useful if the caller does not really
+   need a frame pointer. 
*/ + + .text + .align 0 + + .code 32 + .globl _arm_return +LSYM(Lstart_arm_return): + cfi_start LSYM(Lstart_arm_return) LSYM(Lend_arm_return) + cfi_push 0, 0xe, -0x8, 0x8 + nop @ This nop is for the benefit of debuggers, so that + @ backtraces will use the correct unwind information. +_arm_return: + RETLDM unwind=LSYM(Lstart_arm_return) + cfi_end LSYM(Lend_arm_return) + + .globl _arm_return_r7 +_arm_return_r7: + ldr lr, [r7, #-4] + bx lr + + .globl _arm_return_r11 +_arm_return_r11: + ldr lr, [r11, #-4] + bx lr + +.macro interwork_with_frame frame, register, name, return + .code 16 + + THUMB_FUNC_START \name + + bx pc + nop + + .code 32 + tst \register, #1 + streq lr, [\frame, #-4] + adreq lr, _arm_return_\frame + bx \register + + SIZE (\name) +.endm + +.macro interwork register + .code 16 + + THUMB_FUNC_START _interwork_call_via_\register + + bx pc + nop + + .code 32 + .globl LSYM(Lchange_\register) +LSYM(Lchange_\register): + tst \register, #1 + streq lr, [sp, #-8]! + adreq lr, _arm_return + bx \register + + SIZE (_interwork_call_via_\register) + + interwork_with_frame r7,\register,_interwork_r7_call_via_\register + interwork_with_frame r11,\register,_interwork_r11_call_via_\register +.endm + + interwork r0 + interwork r1 + interwork r2 + interwork r3 + interwork r4 + interwork r5 + interwork r6 + interwork r7 + interwork r8 + interwork r9 + interwork sl + interwork fp + interwork ip + interwork sp + + /* The LR case has to be handled a little differently... */ + .code 16 + + THUMB_FUNC_START _interwork_call_via_lr + + bx pc + nop + + .code 32 + .globl .Lchange_lr +.Lchange_lr: + tst lr, #1 + stmeqdb r13!, {lr, pc} + mov ip, lr + adreq lr, _arm_return + bx ip + + SIZE (_interwork_call_via_lr) + +#endif /* L_interwork_call_via_rX */ +#endif /* !__thumb2__ */ + +/* Functions to support compact pic switch tables in thumb1 state. + All these routines take an index into the table in r0. 
The + table is at LR & ~1 (but this must be rounded up in the case + of 32-bit entries). They are only permitted to clobber r12 + and r14, and r0 must be preserved on exit. */ +#ifdef L_thumb1_case_sqi + + .text + .align 0 + .force_thumb + .syntax unified + THUMB_FUNC_START __gnu_thumb1_case_sqi + push {r1} + mov r1, lr + lsrs r1, r1, #1 + lsls r1, r1, #1 + ldrsb r1, [r1, r0] + lsls r1, r1, #1 + add lr, lr, r1 + pop {r1} + bx lr + SIZE (__gnu_thumb1_case_sqi) +#endif + +#ifdef L_thumb1_case_uqi + + .text + .align 0 + .force_thumb + .syntax unified + THUMB_FUNC_START __gnu_thumb1_case_uqi + push {r1} + mov r1, lr + lsrs r1, r1, #1 + lsls r1, r1, #1 + ldrb r1, [r1, r0] + lsls r1, r1, #1 + add lr, lr, r1 + pop {r1} + bx lr + SIZE (__gnu_thumb1_case_uqi) +#endif + +#ifdef L_thumb1_case_shi + + .text + .align 0 + .force_thumb + .syntax unified + THUMB_FUNC_START __gnu_thumb1_case_shi + push {r0, r1} + mov r1, lr + lsrs r1, r1, #1 + lsls r0, r0, #1 + lsls r1, r1, #1 + ldrsh r1, [r1, r0] + lsls r1, r1, #1 + add lr, lr, r1 + pop {r0, r1} + bx lr + SIZE (__gnu_thumb1_case_shi) +#endif + +#ifdef L_thumb1_case_uhi + + .text + .align 0 + .force_thumb + .syntax unified + THUMB_FUNC_START __gnu_thumb1_case_uhi + push {r0, r1} + mov r1, lr + lsrs r1, r1, #1 + lsls r0, r0, #1 + lsls r1, r1, #1 + ldrh r1, [r1, r0] + lsls r1, r1, #1 + add lr, lr, r1 + pop {r0, r1} + bx lr + SIZE (__gnu_thumb1_case_uhi) +#endif + +#ifdef L_thumb1_case_si + + .text + .align 0 + .force_thumb + .syntax unified + THUMB_FUNC_START __gnu_thumb1_case_si + push {r0, r1} + mov r1, lr + adds.n r1, r1, #2 /* Align to word. */ + lsrs r1, r1, #2 + lsls r0, r0, #2 + lsls r1, r1, #2 + ldr r0, [r1, r0] + adds r0, r0, r1 + mov lr, r0 + pop {r0, r1} + mov pc, lr /* We know we were called from thumb code. */ + SIZE (__gnu_thumb1_case_si) +#endif + +#endif /* Arch supports thumb.
*/ + +#ifndef __symbian__ +#ifndef __ARM_ARCH_6M__ +#include "ieee754-df.S" +#include "ieee754-sf.S" +#include "bpabi.S" +#else /* __ARM_ARCH_6M__ */ +#include "bpabi-v6m.S" +#endif /* __ARM_ARCH_6M__ */ +#endif /* !__symbian__ */ diff --git a/libgcc/config/arm/libunwind.S b/libgcc/config/arm/libunwind.S index a3a19daab4b..8166cd86e47 100644 --- a/libgcc/config/arm/libunwind.S +++ b/libgcc/config/arm/libunwind.S @@ -40,7 +40,7 @@ #ifndef __symbian__ -#include "config/arm/lib1funcs.asm" +#include "lib1funcs.S" .macro UNPREFIX name .global SYM (\name) diff --git a/libgcc/config/arm/t-arm b/libgcc/config/arm/t-arm new file mode 100644 index 00000000000..4e17e99b4a5 --- /dev/null +++ b/libgcc/config/arm/t-arm @@ -0,0 +1,3 @@ +LIB1ASMSRC = arm/lib1funcs.S +LIB1ASMFUNCS = _thumb1_case_sqi _thumb1_case_uqi _thumb1_case_shi \ + _thumb1_case_uhi _thumb1_case_si diff --git a/libgcc/config/arm/t-bpabi b/libgcc/config/arm/t-bpabi index ebb2f9fd85d..8787285ab1f 100644 --- a/libgcc/config/arm/t-bpabi +++ b/libgcc/config/arm/t-bpabi @@ -1,3 +1,6 @@ +# Add the bpabi.S functions. +LIB1ASMFUNCS += _aeabi_lcmp _aeabi_ulcmp _aeabi_ldivmod _aeabi_uldivmod + LIB2ADDEH = $(srcdir)/config/arm/unwind-arm.c \ $(srcdir)/config/arm/libunwind.S \ $(srcdir)/config/arm/pr-support.c $(srcdir)/unwind-c.c diff --git a/libgcc/config/arm/t-elf b/libgcc/config/arm/t-elf new file mode 100644 index 00000000000..fab32e445be --- /dev/null +++ b/libgcc/config/arm/t-elf @@ -0,0 +1,13 @@ +# For most CPUs we have an assembly soft-float implementation. +# However this is not true for ARMv6M. Here we want to use the soft-fp C +# implementation. The soft-fp code is only built for ARMv6M. This pulls +# in the asm implementation for other CPUs.
+LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _bb_init_func \ + _call_via_rX _interwork_call_via_rX \ + _lshrdi3 _ashrdi3 _ashldi3 \ + _arm_negdf2 _arm_addsubdf3 _arm_muldivdf3 _arm_cmpdf2 _arm_unorddf2 \ + _arm_fixdfsi _arm_fixunsdfsi \ + _arm_truncdfsf2 _arm_negsf2 _arm_addsubsf3 _arm_muldivsf3 \ + _arm_cmpsf2 _arm_unordsf2 _arm_fixsfsi _arm_fixunssfsi \ + _arm_floatdidf _arm_floatdisf _arm_floatundidf _arm_floatundisf \ + _clzsi2 _clzdi2 diff --git a/libgcc/config/arm/t-linux b/libgcc/config/arm/t-linux new file mode 100644 index 00000000000..a154f775a0f --- /dev/null +++ b/libgcc/config/arm/t-linux @@ -0,0 +1,3 @@ +LIB1ASMSRC = arm/lib1funcs.S +LIB1ASMFUNCS = _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_lnx _clzsi2 _clzdi2 \ + _arm_addsubdf3 _arm_addsubsf3 diff --git a/libgcc/config/arm/t-linux-eabi b/libgcc/config/arm/t-linux-eabi new file mode 100644 index 00000000000..dfc9197ea45 --- /dev/null +++ b/libgcc/config/arm/t-linux-eabi @@ -0,0 +1,2 @@ +# Use a version of div0 which raises SIGFPE, and a special __clear_cache. +LIB1ASMFUNCS := $(filter-out _dvmd_tls,$(LIB1ASMFUNCS)) _dvmd_lnx _clear_cache diff --git a/libgcc/config/arm/t-strongarm-elf b/libgcc/config/arm/t-strongarm-elf new file mode 100644 index 00000000000..cd9f9667ddf --- /dev/null +++ b/libgcc/config/arm/t-strongarm-elf @@ -0,0 +1 @@ +LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _bb_init_func _clzsi2 _clzdi2 diff --git a/libgcc/config/arm/t-symbian b/libgcc/config/arm/t-symbian index 6788d5f40b3..1989696c8a3 100644 --- a/libgcc/config/arm/t-symbian +++ b/libgcc/config/arm/t-symbian @@ -1,2 +1,16 @@ +LIB1ASMFUNCS += _bb_init_func _call_via_rX _interwork_call_via_rX _clzsi2 _clzdi2 + +# These functions have __aeabi equivalents and will never be called by GCC. +# By putting them in LIB1ASMFUNCS, we avoid the standard libgcc2.c code being +# used -- and we make sure that definitions are not available in lib1funcs.S, +# either, so they end up undefined. 
+LIB1ASMFUNCS += \ + _ashldi3 _ashrdi3 _divdi3 _floatdidf _udivmoddi4 _umoddi3 \ + _udivdi3 _lshrdi3 _moddi3 _muldi3 _negdi2 _cmpdi2 \ + _fixdfdi _fixsfdi _fixunsdfdi _fixunssfdi _floatdisf \ + _negdf2 _addsubdf3 _muldivdf3 _cmpdf2 _unorddf2 _fixdfsi _fixunsdfsi \ + _truncdfsf2 _negsf2 _addsubsf3 _muldivsf3 _cmpsf2 _unordsf2 \ + _fixsfsi _fixunssfsi + # Include the gcc personality routine LIB2ADDEH = $(srcdir)/unwind-c.c $(srcdir)/config/arm/pr-support.c diff --git a/libgcc/config/arm/t-vxworks b/libgcc/config/arm/t-vxworks new file mode 100644 index 00000000000..70ccdc1556a --- /dev/null +++ b/libgcc/config/arm/t-vxworks @@ -0,0 +1 @@ +LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _bb_init_func _call_via_rX _interwork_call_via_rX _clzsi2 _clzdi2 diff --git a/libgcc/config/arm/t-wince-pe b/libgcc/config/arm/t-wince-pe new file mode 100644 index 00000000000..33ea969ccf4 --- /dev/null +++ b/libgcc/config/arm/t-wince-pe @@ -0,0 +1 @@ +LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _call_via_rX _interwork_call_via_rX _clzsi2 _clzdi2 diff --git a/libgcc/config/avr/lib1funcs.S b/libgcc/config/avr/lib1funcs.S new file mode 100644 index 00000000000..8c369c96a77 --- /dev/null +++ b/libgcc/config/avr/lib1funcs.S @@ -0,0 +1,1533 @@ +/* -*- Mode: Asm -*- */ +/* Copyright (C) 1998, 1999, 2000, 2007, 2008, 2009 + Free Software Foundation, Inc. + Contributed by Denis Chertykov <chertykov@gmail.com> + +This file is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3, or (at your option) any +later version. + +This file is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. 
+ +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + +#define __zero_reg__ r1 +#define __tmp_reg__ r0 +#define __SREG__ 0x3f +#define __SP_H__ 0x3e +#define __SP_L__ 0x3d +#define __RAMPZ__ 0x3B +#define __EIND__ 0x3C + +/* Most of the functions here are called directly from avr.md + patterns, instead of using the standard libcall mechanisms. + This can make better code because GCC knows exactly which + of the call-used registers (not all of them) are clobbered. */ + +/* FIXME: At present, there is no SORT directive in the linker + script so that we must not assume that different modules + in the same input section like .libgcc.text.mul will be + located close together. Therefore, we cannot use + RCALL/RJMP to call a function like __udivmodhi4 from + __divmodhi4 and have to use lengthy XCALL/XJMP even + though they are in the same input section and all same + input sections together are small enough to reach every + location with a RCALL/RJMP instruction. 
*/ + + .macro mov_l r_dest, r_src +#if defined (__AVR_HAVE_MOVW__) + movw \r_dest, \r_src +#else + mov \r_dest, \r_src +#endif + .endm + + .macro mov_h r_dest, r_src +#if defined (__AVR_HAVE_MOVW__) + ; empty +#else + mov \r_dest, \r_src +#endif + .endm + +#if defined (__AVR_HAVE_JMP_CALL__) +#define XCALL call +#define XJMP jmp +#else +#define XCALL rcall +#define XJMP rjmp +#endif + +.macro DEFUN name +.global \name +.func \name +\name: +.endm + +.macro ENDF name +.size \name, .-\name +.endfunc +.endm + + +.section .text.libgcc.mul, "ax", @progbits + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +/* Note: mulqi3, mulhi3 are open-coded on the enhanced core. */ +#if !defined (__AVR_HAVE_MUL__) +/******************************************************* + Multiplication 8 x 8 without MUL +*******************************************************/ +#if defined (L_mulqi3) + +#define r_arg2 r22 /* multiplicand */ +#define r_arg1 r24 /* multiplier */ +#define r_res __tmp_reg__ /* result */ + +DEFUN __mulqi3 + clr r_res ; clear result +__mulqi3_loop: + sbrc r_arg1,0 + add r_res,r_arg2 + add r_arg2,r_arg2 ; shift multiplicand + breq __mulqi3_exit ; while multiplicand != 0 + lsr r_arg1 ; shift multiplier right + brne __mulqi3_loop ; exit if multiplier = 0 +__mulqi3_exit: + mov r_arg1,r_res ; result to return register + ret +ENDF __mulqi3 + +#undef r_arg2 +#undef r_arg1 +#undef r_res + +#endif /* defined (L_mulqi3) */ + +#if defined (L_mulqihi3) +DEFUN __mulqihi3 + clr r25 + sbrc r24, 7 + dec r25 ; r25 = 0xff if r24 is negative: sign-extend r24 + clr r23 + sbrc r22, 7 + dec r23 ; r23 = 0xff if r22 is negative: sign-extend r22 + XJMP __mulhi3 +ENDF __mulqihi3 +#endif /* defined (L_mulqihi3) */ + +#if defined (L_umulqihi3) +DEFUN __umulqihi3 + clr r25 + clr r23 + XJMP __mulhi3 +ENDF __umulqihi3 +#endif /* defined (L_umulqihi3) */ + +/******************************************************* + Multiplication 16 x 16 without MUL +*******************************************************/ +#if defined (L_mulhi3) +#define r_arg1L r24 /* multiplier Low */ +#define r_arg1H r25 /* multiplier High
*/ +#define r_arg2L r22 /* multiplicand Low */ +#define r_arg2H r23 /* multiplicand High */ +#define r_resL __tmp_reg__ /* result Low */ +#define r_resH r21 /* result High */ + +DEFUN __mulhi3 + clr r_resH ; clear result + clr r_resL ; clear result +__mulhi3_loop: + sbrs r_arg1L,0 + rjmp __mulhi3_skip1 + add r_resL,r_arg2L ; result + multiplicand + adc r_resH,r_arg2H +__mulhi3_skip1: + add r_arg2L,r_arg2L ; shift multiplicand + adc r_arg2H,r_arg2H + + cp r_arg2L,__zero_reg__ + cpc r_arg2H,__zero_reg__ + breq __mulhi3_exit ; while multiplicand != 0 + + lsr r_arg1H ; gets LSB of multiplier + ror r_arg1L + sbiw r_arg1L,0 + brne __mulhi3_loop ; exit if multiplier = 0 +__mulhi3_exit: + mov r_arg1H,r_resH ; result to return register + mov r_arg1L,r_resL + ret +ENDF __mulhi3 + +#undef r_arg1L +#undef r_arg1H +#undef r_arg2L +#undef r_arg2H +#undef r_resL +#undef r_resH + +#endif /* defined (L_mulhi3) */ + +/******************************************************* + Widening Multiplication 32 = 16 x 16 without MUL +*******************************************************/ + +#if defined (L_mulhisi3) +DEFUN __mulhisi3 +;;; FIXME: This is dead code (no one calls it) + mov_l r18, r24 + mov_h r19, r25 + clr r24 + sbrc r23, 7 + dec r24 + mov r25, r24 + clr r20 + sbrc r19, 7 + dec r20 + mov r21, r20 + XJMP __mulsi3 +ENDF __mulhisi3 +#endif /* defined (L_mulhisi3) */ + +#if defined (L_umulhisi3) +DEFUN __umulhisi3 +;;; FIXME: This is dead code (no one calls it) + mov_l r18, r24 + mov_h r19, r25 + clr r24 + clr r25 + mov_l r20, r24 + mov_h r21, r25 + XJMP __mulsi3 +ENDF __umulhisi3 +#endif /* defined (L_umulhisi3) */ + +#if defined (L_mulsi3) +/******************************************************* + Multiplication 32 x 32 without MUL +*******************************************************/ +#define r_arg1L r22 /* multiplier Low */ +#define r_arg1H r23 +#define r_arg1HL r24 +#define r_arg1HH r25 /* multiplier High */ + +#define r_arg2L r18 /* multiplicand Low */ +#define r_arg2H r19
+#define r_arg2HL r20 +#define r_arg2HH r21 /* multiplicand High */ + +#define r_resL r26 /* result Low */ +#define r_resH r27 +#define r_resHL r30 +#define r_resHH r31 /* result High */ + +DEFUN __mulsi3 + clr r_resHH ; clear result + clr r_resHL ; clear result + clr r_resH ; clear result + clr r_resL ; clear result +__mulsi3_loop: + sbrs r_arg1L,0 + rjmp __mulsi3_skip1 + add r_resL,r_arg2L ; result + multiplicand + adc r_resH,r_arg2H + adc r_resHL,r_arg2HL + adc r_resHH,r_arg2HH +__mulsi3_skip1: + add r_arg2L,r_arg2L ; shift multiplicand + adc r_arg2H,r_arg2H + adc r_arg2HL,r_arg2HL + adc r_arg2HH,r_arg2HH + + lsr r_arg1HH ; gets LSB of multiplier + ror r_arg1HL + ror r_arg1H + ror r_arg1L + brne __mulsi3_loop + sbiw r_arg1HL,0 + cpc r_arg1H,r_arg1L + brne __mulsi3_loop ; exit if multiplier = 0 +__mulsi3_exit: + mov_h r_arg1HH,r_resHH ; result to return register + mov_l r_arg1HL,r_resHL + mov_h r_arg1H,r_resH + mov_l r_arg1L,r_resL + ret +ENDF __mulsi3 + +#undef r_arg1L +#undef r_arg1H +#undef r_arg1HL +#undef r_arg1HH + +#undef r_arg2L +#undef r_arg2H +#undef r_arg2HL +#undef r_arg2HH + +#undef r_resL +#undef r_resH +#undef r_resHL +#undef r_resHH + +#endif /* defined (L_mulsi3) */ + +#endif /* !defined (__AVR_HAVE_MUL__) */ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +#if defined (__AVR_HAVE_MUL__) +#define A0 26 +#define B0 18 +#define C0 22 + +#define A1 A0+1 + +#define B1 B0+1 +#define B2 B0+2 +#define B3 B0+3 + +#define C1 C0+1 +#define C2 C0+2 +#define C3 C0+3 + +/******************************************************* + Widening Multiplication 32 = 16 x 16 +*******************************************************/ + +#if defined (L_mulhisi3) +;;; R25:R22 = (signed long) R27:R26 * (signed long) R19:R18 +;;; C3:C0 = (signed long) A1:A0 * (signed long) B1:B0 +;;; Clobbers: __tmp_reg__ +DEFUN __mulhisi3 + XCALL __umulhisi3 + ;; Sign-extend B + tst B1 + brpl 1f + sub C2, A0 + sbc C3, A1 +1: 
;; Sign-extend A + XJMP __usmulhisi3_tail +ENDF __mulhisi3 +#endif /* L_mulhisi3 */ + +#if defined (L_usmulhisi3) +;;; R25:R22 = (signed long) R27:R26 * (unsigned long) R19:R18 +;;; C3:C0 = (signed long) A1:A0 * (unsigned long) B1:B0 +;;; Clobbers: __tmp_reg__ +DEFUN __usmulhisi3 + XCALL __umulhisi3 + ;; FALLTHRU +ENDF __usmulhisi3 + +DEFUN __usmulhisi3_tail + ;; Sign-extend A + sbrs A1, 7 + ret + sub C2, B0 + sbc C3, B1 + ret +ENDF __usmulhisi3_tail +#endif /* L_usmulhisi3 */ + +#if defined (L_umulhisi3) +;;; R25:R22 = (unsigned long) R27:R26 * (unsigned long) R19:R18 +;;; C3:C0 = (unsigned long) A1:A0 * (unsigned long) B1:B0 +;;; Clobbers: __tmp_reg__ +DEFUN __umulhisi3 + mul A0, B0 + movw C0, r0 + mul A1, B1 + movw C2, r0 + mul A0, B1 + rcall 1f + mul A1, B0 +1: add C1, r0 + adc C2, r1 + clr __zero_reg__ + adc C3, __zero_reg__ + ret +ENDF __umulhisi3 +#endif /* L_umulhisi3 */ + +/******************************************************* + Widening Multiplication 32 = 16 x 32 +*******************************************************/ + +#if defined (L_mulshisi3) +;;; R25:R22 = (signed long) R27:R26 * R21:R18 +;;; (C3:C0) = (signed long) A1:A0 * B3:B0 +;;; Clobbers: __tmp_reg__ +DEFUN __mulshisi3 +#ifdef __AVR_ERRATA_SKIP_JMP_CALL__ + ;; Some cores have problem skipping 2-word instruction + tst A1 + brmi __mulohisi3 +#else + sbrs A1, 7 +#endif /* __AVR_ERRATA_SKIP_JMP_CALL__ */ + XJMP __muluhisi3 + ;; FALLTHRU +ENDF __mulshisi3 + +;;; R25:R22 = (one-extended long) R27:R26 * R21:R18 +;;; (C3:C0) = (one-extended long) A1:A0 * B3:B0 +;;; Clobbers: __tmp_reg__ +DEFUN __mulohisi3 + XCALL __muluhisi3 + ;; One-extend R27:R26 (A1:A0) + sub C2, B0 + sbc C3, B1 + ret +ENDF __mulohisi3 +#endif /* L_mulshisi3 */ + +#if defined (L_muluhisi3) +;;; R25:R22 = (unsigned long) R27:R26 * R21:R18 +;;; (C3:C0) = (unsigned long) A1:A0 * B3:B0 +;;; Clobbers: __tmp_reg__ +DEFUN __muluhisi3 + XCALL __umulhisi3 + mul A0, B3 + add C3, r0 + mul A1, B2 + add C3, r0 + mul A0, B2 + add C2, r0 + adc C3,
r1 + clr __zero_reg__ + ret +ENDF __muluhisi3 +#endif /* L_muluhisi3 */ + +/******************************************************* + Multiplication 32 x 32 +*******************************************************/ + +#if defined (L_mulsi3) +;;; R25:R22 = R25:R22 * R21:R18 +;;; (C3:C0) = C3:C0 * B3:B0 +;;; Clobbers: R26, R27, __tmp_reg__ +DEFUN __mulsi3 + movw A0, C0 + push C2 + push C3 + XCALL __muluhisi3 + pop A1 + pop A0 + ;; A1:A0 now contains the high word of A + mul A0, B0 + add C2, r0 + adc C3, r1 + mul A0, B1 + add C3, r0 + mul A1, B0 + add C3, r0 + clr __zero_reg__ + ret +ENDF __mulsi3 +#endif /* L_mulsi3 */ + +#undef A0 +#undef A1 + +#undef B0 +#undef B1 +#undef B2 +#undef B3 + +#undef C0 +#undef C1 +#undef C2 +#undef C3 + +#endif /* __AVR_HAVE_MUL__ */ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + +.section .text.libgcc.div, "ax", @progbits + +/******************************************************* + Division 8 / 8 => (result + remainder) +*******************************************************/ +#define r_rem r25 /* remainder */ +#define r_arg1 r24 /* dividend, quotient */ +#define r_arg2 r22 /* divisor */ +#define r_cnt r23 /* loop count */ + +#if defined (L_udivmodqi4) +DEFUN __udivmodqi4 + sub r_rem,r_rem ; clear remainder and carry + ldi r_cnt,9 ; init loop counter + rjmp __udivmodqi4_ep ; jump to entry point +__udivmodqi4_loop: + rol r_rem ; shift dividend into remainder + cp r_rem,r_arg2 ; compare remainder & divisor + brcs __udivmodqi4_ep ; remainder < divisor + sub r_rem,r_arg2 ; restore remainder +__udivmodqi4_ep: + rol r_arg1 ; shift dividend (with CARRY) + dec r_cnt ; decrement loop counter + brne __udivmodqi4_loop + com r_arg1 ; complement result + ; because C flag was complemented in loop + ret +ENDF __udivmodqi4 +#endif /* defined (L_udivmodqi4) */ + +#if defined (L_divmodqi4) +DEFUN __divmodqi4 + bst r_arg1,7 ; store sign of dividend + mov __tmp_reg__,r_arg1 + eor __tmp_reg__,r_arg2; r0.7 is sign of result + sbrc r_arg1,7
+ neg r_arg1 ; dividend negative : negate + sbrc r_arg2,7 + neg r_arg2 ; divisor negative : negate + XCALL __udivmodqi4 ; do the unsigned div/mod + brtc __divmodqi4_1 + neg r_rem ; correct remainder sign +__divmodqi4_1: + sbrc __tmp_reg__,7 + neg r_arg1 ; correct result sign +__divmodqi4_exit: + ret +ENDF __divmodqi4 +#endif /* defined (L_divmodqi4) */ + +#undef r_rem +#undef r_arg1 +#undef r_arg2 +#undef r_cnt + + +/******************************************************* + Division 16 / 16 => (result + remainder) +*******************************************************/ +#define r_remL r26 /* remainder Low */ +#define r_remH r27 /* remainder High */ + +/* return: remainder */ +#define r_arg1L r24 /* dividend Low */ +#define r_arg1H r25 /* dividend High */ + +/* return: quotient */ +#define r_arg2L r22 /* divisor Low */ +#define r_arg2H r23 /* divisor High */ + +#define r_cnt r21 /* loop count */ + +#if defined (L_udivmodhi4) +DEFUN __udivmodhi4 + sub r_remL,r_remL + sub r_remH,r_remH ; clear remainder and carry + ldi r_cnt,17 ; init loop counter + rjmp __udivmodhi4_ep ; jump to entry point +__udivmodhi4_loop: + rol r_remL ; shift dividend into remainder + rol r_remH + cp r_remL,r_arg2L ; compare remainder & divisor + cpc r_remH,r_arg2H + brcs __udivmodhi4_ep ; remainder < divisor + sub r_remL,r_arg2L ; restore remainder + sbc r_remH,r_arg2H +__udivmodhi4_ep: + rol r_arg1L ; shift dividend (with CARRY) + rol r_arg1H + dec r_cnt ; decrement loop counter + brne __udivmodhi4_loop + com r_arg1L + com r_arg1H +; div/mod results to return registers, as for the div() function + mov_l r_arg2L, r_arg1L ; quotient + mov_h r_arg2H, r_arg1H + mov_l r_arg1L, r_remL ; remainder + mov_h r_arg1H, r_remH + ret +ENDF __udivmodhi4 +#endif /* defined (L_udivmodhi4) */ + +#if defined (L_divmodhi4) +DEFUN __divmodhi4 + .global _div +_div: + bst r_arg1H,7 ; store sign of dividend + mov __tmp_reg__,r_arg1H + eor __tmp_reg__,r_arg2H ; r0.7 is sign of result + rcall __divmodhi4_neg1 ; 
dividend negative : negate + sbrc r_arg2H,7 + rcall __divmodhi4_neg2 ; divisor negative : negate + XCALL __udivmodhi4 ; do the unsigned div/mod + rcall __divmodhi4_neg1 ; correct remainder sign + tst __tmp_reg__ + brpl __divmodhi4_exit +__divmodhi4_neg2: + com r_arg2H + neg r_arg2L ; correct divisor/result sign + sbci r_arg2H,0xff +__divmodhi4_exit: + ret +__divmodhi4_neg1: + brtc __divmodhi4_exit + com r_arg1H + neg r_arg1L ; correct dividend/remainder sign + sbci r_arg1H,0xff + ret +ENDF __divmodhi4 +#endif /* defined (L_divmodhi4) */ + +#undef r_remH +#undef r_remL + +#undef r_arg1H +#undef r_arg1L + +#undef r_arg2H +#undef r_arg2L + +#undef r_cnt + +/******************************************************* + Division 32 / 32 => (result + remainder) +*******************************************************/ +#define r_remHH r31 /* remainder High */ +#define r_remHL r30 +#define r_remH r27 +#define r_remL r26 /* remainder Low */ + +/* return: remainder */ +#define r_arg1HH r25 /* dividend High */ +#define r_arg1HL r24 +#define r_arg1H r23 +#define r_arg1L r22 /* dividend Low */ + +/* return: quotient */ +#define r_arg2HH r21 /* divisor High */ +#define r_arg2HL r20 +#define r_arg2H r19 +#define r_arg2L r18 /* divisor Low */ + +#define r_cnt __zero_reg__ /* loop count (0 after the loop!) 
*/ + +#if defined (L_udivmodsi4) +DEFUN __udivmodsi4 + ldi r_remL, 33 ; init loop counter + mov r_cnt, r_remL + sub r_remL,r_remL + sub r_remH,r_remH ; clear remainder and carry + mov_l r_remHL, r_remL + mov_h r_remHH, r_remH + rjmp __udivmodsi4_ep ; jump to entry point +__udivmodsi4_loop: + rol r_remL ; shift dividend into remainder + rol r_remH + rol r_remHL + rol r_remHH + cp r_remL,r_arg2L ; compare remainder & divisor + cpc r_remH,r_arg2H + cpc r_remHL,r_arg2HL + cpc r_remHH,r_arg2HH + brcs __udivmodsi4_ep ; remainder < divisor + sub r_remL,r_arg2L ; restore remainder + sbc r_remH,r_arg2H + sbc r_remHL,r_arg2HL + sbc r_remHH,r_arg2HH +__udivmodsi4_ep: + rol r_arg1L ; shift dividend (with CARRY) + rol r_arg1H + rol r_arg1HL + rol r_arg1HH + dec r_cnt ; decrement loop counter + brne __udivmodsi4_loop + ; __zero_reg__ now restored (r_cnt == 0) + com r_arg1L + com r_arg1H + com r_arg1HL + com r_arg1HH +; div/mod results to return registers, as for the ldiv() function + mov_l r_arg2L, r_arg1L ; quotient + mov_h r_arg2H, r_arg1H + mov_l r_arg2HL, r_arg1HL + mov_h r_arg2HH, r_arg1HH + mov_l r_arg1L, r_remL ; remainder + mov_h r_arg1H, r_remH + mov_l r_arg1HL, r_remHL + mov_h r_arg1HH, r_remHH + ret +ENDF __udivmodsi4 +#endif /* defined (L_udivmodsi4) */ + +#if defined (L_divmodsi4) +DEFUN __divmodsi4 + bst r_arg1HH,7 ; store sign of dividend + mov __tmp_reg__,r_arg1HH + eor __tmp_reg__,r_arg2HH ; r0.7 is sign of result + rcall __divmodsi4_neg1 ; dividend negative : negate + sbrc r_arg2HH,7 + rcall __divmodsi4_neg2 ; divisor negative : negate + XCALL __udivmodsi4 ; do the unsigned div/mod + rcall __divmodsi4_neg1 ; correct remainder sign + rol __tmp_reg__ + brcc __divmodsi4_exit +__divmodsi4_neg2: + com r_arg2HH + com r_arg2HL + com r_arg2H + neg r_arg2L ; correct divisor/quotient sign + sbci r_arg2H,0xff + sbci r_arg2HL,0xff + sbci r_arg2HH,0xff +__divmodsi4_exit: + ret +__divmodsi4_neg1: + brtc __divmodsi4_exit + com r_arg1HH + com r_arg1HL + com r_arg1H + neg
r_arg1L ; correct dividend/remainder sign + sbci r_arg1H, 0xff + sbci r_arg1HL,0xff + sbci r_arg1HH,0xff + ret +ENDF __divmodsi4 +#endif /* defined (L_divmodsi4) */ + + +.section .text.libgcc.prologue, "ax", @progbits + +/********************************** + * This is a prologue subroutine + **********************************/ +#if defined (L_prologue) + +DEFUN __prologue_saves__ + push r2 + push r3 + push r4 + push r5 + push r6 + push r7 + push r8 + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + push r16 + push r17 + push r28 + push r29 + in r28,__SP_L__ + in r29,__SP_H__ + sub r28,r26 + sbc r29,r27 + in __tmp_reg__,__SREG__ + cli + out __SP_H__,r29 + out __SREG__,__tmp_reg__ + out __SP_L__,r28 +#if defined (__AVR_HAVE_EIJMP_EICALL__) + eijmp +#else + ijmp +#endif + +ENDF __prologue_saves__ +#endif /* defined (L_prologue) */ + +/* + * This is an epilogue subroutine + */ +#if defined (L_epilogue) + +DEFUN __epilogue_restores__ + ldd r2,Y+18 + ldd r3,Y+17 + ldd r4,Y+16 + ldd r5,Y+15 + ldd r6,Y+14 + ldd r7,Y+13 + ldd r8,Y+12 + ldd r9,Y+11 + ldd r10,Y+10 + ldd r11,Y+9 + ldd r12,Y+8 + ldd r13,Y+7 + ldd r14,Y+6 + ldd r15,Y+5 + ldd r16,Y+4 + ldd r17,Y+3 + ldd r26,Y+2 + ldd r27,Y+1 + add r28,r30 + adc r29,__zero_reg__ + in __tmp_reg__,__SREG__ + cli + out __SP_H__,r29 + out __SREG__,__tmp_reg__ + out __SP_L__,r28 + mov_l r28, r26 + mov_h r29, r27 + ret +ENDF __epilogue_restores__ +#endif /* defined (L_epilogue) */ + +#ifdef L_exit + .section .fini9,"ax",@progbits +DEFUN _exit + .weak exit +exit: +ENDF _exit + + /* Code from .fini8 ... .fini1 sections inserted by ld script. 
*/ + + .section .fini0,"ax",@progbits + cli +__stop_program: + rjmp __stop_program +#endif /* defined (L_exit) */ + +#ifdef L_cleanup + .weak _cleanup + .func _cleanup +_cleanup: + ret +.endfunc +#endif /* defined (L_cleanup) */ + + +.section .text.libgcc, "ax", @progbits + +#ifdef L_tablejump +DEFUN __tablejump2__ + lsl r30 + rol r31 + ;; FALLTHRU +ENDF __tablejump2__ + +DEFUN __tablejump__ +#if defined (__AVR_HAVE_LPMX__) + lpm __tmp_reg__, Z+ + lpm r31, Z + mov r30, __tmp_reg__ +#if defined (__AVR_HAVE_EIJMP_EICALL__) + eijmp +#else + ijmp +#endif + +#else /* !HAVE_LPMX */ + lpm + adiw r30, 1 + push r0 + lpm + push r0 +#if defined (__AVR_HAVE_EIJMP_EICALL__) + in __tmp_reg__, __EIND__ + push __tmp_reg__ +#endif + ret +#endif /* !HAVE_LPMX */ +ENDF __tablejump__ +#endif /* defined (L_tablejump) */ + +#ifdef L_copy_data + .section .init4,"ax",@progbits +DEFUN __do_copy_data +#if defined(__AVR_HAVE_ELPMX__) + ldi r17, hi8(__data_end) + ldi r26, lo8(__data_start) + ldi r27, hi8(__data_start) + ldi r30, lo8(__data_load_start) + ldi r31, hi8(__data_load_start) + ldi r16, hh8(__data_load_start) + out __RAMPZ__, r16 + rjmp .L__do_copy_data_start +.L__do_copy_data_loop: + elpm r0, Z+ + st X+, r0 +.L__do_copy_data_start: + cpi r26, lo8(__data_end) + cpc r27, r17 + brne .L__do_copy_data_loop +#elif !defined(__AVR_HAVE_ELPMX__) && defined(__AVR_HAVE_ELPM__) + ldi r17, hi8(__data_end) + ldi r26, lo8(__data_start) + ldi r27, hi8(__data_start) + ldi r30, lo8(__data_load_start) + ldi r31, hi8(__data_load_start) + ldi r16, hh8(__data_load_start - 0x10000) +.L__do_copy_data_carry: + inc r16 + out __RAMPZ__, r16 + rjmp .L__do_copy_data_start +.L__do_copy_data_loop: + elpm + st X+, r0 + adiw r30, 1 + brcs .L__do_copy_data_carry +.L__do_copy_data_start: + cpi r26, lo8(__data_end) + cpc r27, r17 + brne .L__do_copy_data_loop +#elif !defined(__AVR_HAVE_ELPMX__) && !defined(__AVR_HAVE_ELPM__) + ldi r17, hi8(__data_end) + ldi r26, lo8(__data_start) + ldi r27, hi8(__data_start) + ldi r30, 
lo8(__data_load_start) + ldi r31, hi8(__data_load_start) + rjmp .L__do_copy_data_start +.L__do_copy_data_loop: +#if defined (__AVR_HAVE_LPMX__) + lpm r0, Z+ +#else + lpm + adiw r30, 1 +#endif + st X+, r0 +.L__do_copy_data_start: + cpi r26, lo8(__data_end) + cpc r27, r17 + brne .L__do_copy_data_loop +#endif /* !defined(__AVR_HAVE_ELPMX__) && !defined(__AVR_HAVE_ELPM__) */ +ENDF __do_copy_data +#endif /* L_copy_data */ + +/* __do_clear_bss is only necessary if there is anything in .bss section. */ + +#ifdef L_clear_bss + .section .init4,"ax",@progbits +DEFUN __do_clear_bss + ldi r17, hi8(__bss_end) + ldi r26, lo8(__bss_start) + ldi r27, hi8(__bss_start) + rjmp .do_clear_bss_start +.do_clear_bss_loop: + st X+, __zero_reg__ +.do_clear_bss_start: + cpi r26, lo8(__bss_end) + cpc r27, r17 + brne .do_clear_bss_loop +ENDF __do_clear_bss +#endif /* L_clear_bss */ + +/* __do_global_ctors and __do_global_dtors are only necessary + if there are any constructors/destructors. */ + +#ifdef L_ctors + .section .init6,"ax",@progbits +DEFUN __do_global_ctors +#if defined(__AVR_HAVE_RAMPZ__) + ldi r17, hi8(__ctors_start) + ldi r28, lo8(__ctors_end) + ldi r29, hi8(__ctors_end) + ldi r16, hh8(__ctors_end) + rjmp .L__do_global_ctors_start +.L__do_global_ctors_loop: + sbiw r28, 2 + sbc r16, __zero_reg__ + mov_h r31, r29 + mov_l r30, r28 + out __RAMPZ__, r16 + XCALL __tablejump_elpm__ +.L__do_global_ctors_start: + cpi r28, lo8(__ctors_start) + cpc r29, r17 + ldi r24, hh8(__ctors_start) + cpc r16, r24 + brne .L__do_global_ctors_loop +#else + ldi r17, hi8(__ctors_start) + ldi r28, lo8(__ctors_end) + ldi r29, hi8(__ctors_end) + rjmp .L__do_global_ctors_start +.L__do_global_ctors_loop: + sbiw r28, 2 + mov_h r31, r29 + mov_l r30, r28 + XCALL __tablejump__ +.L__do_global_ctors_start: + cpi r28, lo8(__ctors_start) + cpc r29, r17 + brne .L__do_global_ctors_loop +#endif /* defined(__AVR_HAVE_RAMPZ__) */ +ENDF __do_global_ctors +#endif /* L_ctors */ + +#ifdef L_dtors + .section .fini6,"ax",@progbits 
+DEFUN __do_global_dtors +#if defined(__AVR_HAVE_RAMPZ__) + ldi r17, hi8(__dtors_end) + ldi r28, lo8(__dtors_start) + ldi r29, hi8(__dtors_start) + ldi r16, hh8(__dtors_start) + rjmp .L__do_global_dtors_start +.L__do_global_dtors_loop: + sbiw r28, 2 + sbc r16, __zero_reg__ + mov_h r31, r29 + mov_l r30, r28 + out __RAMPZ__, r16 + XCALL __tablejump_elpm__ +.L__do_global_dtors_start: + cpi r28, lo8(__dtors_end) + cpc r29, r17 + ldi r24, hh8(__dtors_end) + cpc r16, r24 + brne .L__do_global_dtors_loop +#else + ldi r17, hi8(__dtors_end) + ldi r28, lo8(__dtors_start) + ldi r29, hi8(__dtors_start) + rjmp .L__do_global_dtors_start +.L__do_global_dtors_loop: + mov_h r31, r29 + mov_l r30, r28 + XCALL __tablejump__ + adiw r28, 2 +.L__do_global_dtors_start: + cpi r28, lo8(__dtors_end) + cpc r29, r17 + brne .L__do_global_dtors_loop +#endif /* defined(__AVR_HAVE_RAMPZ__) */ +ENDF __do_global_dtors +#endif /* L_dtors */ + +.section .text.libgcc, "ax", @progbits + +#ifdef L_tablejump_elpm +DEFUN __tablejump_elpm__ +#if defined (__AVR_HAVE_ELPM__) +#if defined (__AVR_HAVE_LPMX__) + elpm __tmp_reg__, Z+ + elpm r31, Z + mov r30, __tmp_reg__ +#if defined (__AVR_HAVE_EIJMP_EICALL__) + eijmp +#else + ijmp +#endif + +#else + elpm + adiw r30, 1 + push r0 + elpm + push r0 +#if defined (__AVR_HAVE_EIJMP_EICALL__) + in __tmp_reg__, __EIND__ + push __tmp_reg__ +#endif + ret +#endif +#endif /* defined (__AVR_HAVE_ELPM__) */ +ENDF __tablejump_elpm__ +#endif /* defined (L_tablejump_elpm) */ + + +.section .text.libgcc.builtins, "ax", @progbits + +/********************************** + * Find first set Bit (ffs) + **********************************/ + +#if defined (L_ffssi2) +;; find first set bit +;; r25:r24 = ffs32 (r25:r22) +;; clobbers: r22, r26 +DEFUN __ffssi2 + clr r26 + tst r22 + brne 1f + subi r26, -8 + or r22, r23 + brne 1f + subi r26, -8 + or r22, r24 + brne 1f + subi r26, -8 + or r22, r25 + brne 1f + ret +1: mov r24, r22 + XJMP __loop_ffsqi2 +ENDF __ffssi2 +#endif /* defined (L_ffssi2) */ 
+ +#if defined (L_ffshi2) +;; find first set bit +;; r25:r24 = ffs16 (r25:r24) +;; clobbers: r26 +DEFUN __ffshi2 + clr r26 +#ifdef __AVR_ERRATA_SKIP_JMP_CALL__ + ;; Some cores have problem skipping 2-word instruction + tst r24 + breq 2f +#else + cpse r24, __zero_reg__ +#endif /* __AVR_ERRATA_SKIP_JMP_CALL__ */ +1: XJMP __loop_ffsqi2 +2: ldi r26, 8 + or r24, r25 + brne 1b + ret +ENDF __ffshi2 +#endif /* defined (L_ffshi2) */ + +#if defined (L_loop_ffsqi2) +;; Helper for ffshi2, ffssi2 +;; r25:r24 = r26 + zero_extend16 (ffs8(r24)) +;; r24 must be != 0 +;; clobbers: r26 +DEFUN __loop_ffsqi2 + inc r26 + lsr r24 + brcc __loop_ffsqi2 + mov r24, r26 + clr r25 + ret +ENDF __loop_ffsqi2 +#endif /* defined (L_loop_ffsqi2) */ + + +/********************************** + * Count trailing Zeros (ctz) + **********************************/ + +#if defined (L_ctzsi2) +;; count trailing zeros +;; r25:r24 = ctz32 (r25:r22) +;; clobbers: r26, r22 +;; ctz(0) = 255 +;; Note that ctz(0) is undefined for GCC +DEFUN __ctzsi2 + XCALL __ffssi2 + dec r24 + ret +ENDF __ctzsi2 +#endif /* defined (L_ctzsi2) */ + +#if defined (L_ctzhi2) +;; count trailing zeros +;; r25:r24 = ctz16 (r25:r24) +;; clobbers: r26 +;; ctz(0) = 255 +;; Note that ctz(0) is undefined for GCC +DEFUN __ctzhi2 + XCALL __ffshi2 + dec r24 + ret +ENDF __ctzhi2 +#endif /* defined (L_ctzhi2) */ + + +/********************************** + * Count leading Zeros (clz) + **********************************/ + +#if defined (L_clzdi2) +;; count leading zeros +;; r25:r24 = clz64 (r25:r18) +;; clobbers: r22, r23, r26 +DEFUN __clzdi2 + XCALL __clzsi2 + sbrs r24, 5 + ret + mov_l r22, r18 + mov_h r23, r19 + mov_l r24, r20 + mov_h r25, r21 + XCALL __clzsi2 + subi r24, -32 + ret +ENDF __clzdi2 +#endif /* defined (L_clzdi2) */ + +#if defined (L_clzsi2) +;; count leading zeros +;; r25:r24 = clz32 (r25:r22) +;; clobbers: r26 +DEFUN __clzsi2 + XCALL __clzhi2 + sbrs r24, 4 + ret + mov_l r24, r22 + mov_h r25, r23 + XCALL __clzhi2 + subi r24, -16 + ret +ENDF
__clzsi2 +#endif /* defined (L_clzsi2) */ + +#if defined (L_clzhi2) +;; count leading zeros +;; r25:r24 = clz16 (r25:r24) +;; clobbers: r26 +DEFUN __clzhi2 + clr r26 + tst r25 + brne 1f + subi r26, -8 + or r25, r24 + brne 1f + ldi r24, 16 + ret +1: cpi r25, 16 + brsh 3f + subi r26, -3 + swap r25 +2: inc r26 +3: lsl r25 + brcc 2b + mov r24, r26 + clr r25 + ret +ENDF __clzhi2 +#endif /* defined (L_clzhi2) */ + + +/********************************** + * Parity + **********************************/ + +#if defined (L_paritydi2) +;; r25:r24 = parity64 (r25:r18) +;; clobbers: __tmp_reg__ +DEFUN __paritydi2 + eor r24, r18 + eor r24, r19 + eor r24, r20 + eor r24, r21 + XJMP __paritysi2 +ENDF __paritydi2 +#endif /* defined (L_paritydi2) */ + +#if defined (L_paritysi2) +;; r25:r24 = parity32 (r25:r22) +;; clobbers: __tmp_reg__ +DEFUN __paritysi2 + eor r24, r22 + eor r24, r23 + XJMP __parityhi2 +ENDF __paritysi2 +#endif /* defined (L_paritysi2) */ + +#if defined (L_parityhi2) +;; r25:r24 = parity16 (r25:r24) +;; clobbers: __tmp_reg__ +DEFUN __parityhi2 + eor r24, r25 +;; FALLTHRU +ENDF __parityhi2 + +;; r25:r24 = parity8 (r24) +;; clobbers: __tmp_reg__ +DEFUN __parityqi2 + ;; parity is in r24[0..7] + mov __tmp_reg__, r24 + swap __tmp_reg__ + eor r24, __tmp_reg__ + ;; parity is in r24[0..3] + subi r24, -4 + andi r24, -5 + subi r24, -6 + ;; parity is in r24[0,3] + sbrc r24, 3 + inc r24 + ;; parity is in r24[0] + andi r24, 1 + clr r25 + ret +ENDF __parityqi2 +#endif /* defined (L_parityhi2) */ + + +/********************************** + * Population Count + **********************************/ + +#if defined (L_popcounthi2) +;; population count +;; r25:r24 = popcount16 (r25:r24) +;; clobbers: __tmp_reg__ +DEFUN __popcounthi2 + XCALL __popcountqi2 + push r24 + mov r24, r25 + XCALL __popcountqi2 + clr r25 + ;; FALLTHRU +ENDF __popcounthi2 + +DEFUN __popcounthi2_tail + pop __tmp_reg__ + add r24, __tmp_reg__ + ret +ENDF __popcounthi2_tail +#endif /* defined (L_popcounthi2) */ + +#if 
defined (L_popcountsi2) +;; population count +;; r25:r24 = popcount32 (r25:r22) +;; clobbers: __tmp_reg__ +DEFUN __popcountsi2 + XCALL __popcounthi2 + push r24 + mov_l r24, r22 + mov_h r25, r23 + XCALL __popcounthi2 + XJMP __popcounthi2_tail +ENDF __popcountsi2 +#endif /* defined (L_popcountsi2) */ + +#if defined (L_popcountdi2) +;; population count +;; r25:r24 = popcount64 (r25:r18) +;; clobbers: r22, r23, __tmp_reg__ +DEFUN __popcountdi2 + XCALL __popcountsi2 + push r24 + mov_l r22, r18 + mov_h r23, r19 + mov_l r24, r20 + mov_h r25, r21 + XCALL __popcountsi2 + XJMP __popcounthi2_tail +ENDF __popcountdi2 +#endif /* defined (L_popcountdi2) */ + +#if defined (L_popcountqi2) +;; population count +;; r24 = popcount8 (r24) +;; clobbers: __tmp_reg__ +DEFUN __popcountqi2 + mov __tmp_reg__, r24 + andi r24, 1 + lsr __tmp_reg__ + lsr __tmp_reg__ + adc r24, __zero_reg__ + lsr __tmp_reg__ + adc r24, __zero_reg__ + lsr __tmp_reg__ + adc r24, __zero_reg__ + lsr __tmp_reg__ + adc r24, __zero_reg__ + lsr __tmp_reg__ + adc r24, __zero_reg__ + lsr __tmp_reg__ + adc r24, __tmp_reg__ + ret +ENDF __popcountqi2 +#endif /* defined (L_popcountqi2) */ + + +/********************************** + * Swap bytes + **********************************/ + +;; swap two registers with different register number +.macro bswap a, b + eor \a, \b + eor \b, \a + eor \a, \b +.endm + +#if defined (L_bswapsi2) +;; swap bytes +;; r25:r22 = bswap32 (r25:r22) +DEFUN __bswapsi2 + bswap r22, r25 + bswap r23, r24 + ret +ENDF __bswapsi2 +#endif /* defined (L_bswapsi2) */ + +#if defined (L_bswapdi2) +;; swap bytes +;; r25:r18 = bswap64 (r25:r18) +DEFUN __bswapdi2 + bswap r18, r25 + bswap r19, r24 + bswap r20, r23 + bswap r21, r22 + ret +ENDF __bswapdi2 +#endif /* defined (L_bswapdi2) */ + + +/********************************** + * 64-bit shifts + **********************************/ + +#if defined (L_ashrdi3) +;; Arithmetic shift right +;; r25:r18 = ashr64 (r25:r18, r17:r16) +DEFUN __ashrdi3 + push r16 + andi r16, 63 
+ breq 2f +1: asr r25 + ror r24 + ror r23 + ror r22 + ror r21 + ror r20 + ror r19 + ror r18 + dec r16 + brne 1b +2: pop r16 + ret +ENDF __ashrdi3 +#endif /* defined (L_ashrdi3) */ + +#if defined (L_lshrdi3) +;; Logic shift right +;; r25:r18 = lshr64 (r25:r18, r17:r16) +DEFUN __lshrdi3 + push r16 + andi r16, 63 + breq 2f +1: lsr r25 + ror r24 + ror r23 + ror r22 + ror r21 + ror r20 + ror r19 + ror r18 + dec r16 + brne 1b +2: pop r16 + ret +ENDF __lshrdi3 +#endif /* defined (L_lshrdi3) */ + +#if defined (L_ashldi3) +;; Shift left +;; r25:r18 = ashl64 (r25:r18, r17:r16) +DEFUN __ashldi3 + push r16 + andi r16, 63 + breq 2f +1: lsl r18 + rol r19 + rol r20 + rol r21 + rol r22 + rol r23 + rol r24 + rol r25 + dec r16 + brne 1b +2: pop r16 + ret +ENDF __ashldi3 +#endif /* defined (L_ashldi3) */ + + +.section .text.libgcc.fmul, "ax", @progbits + +/***********************************************************/ +;;; Softmul versions of FMUL, FMULS and FMULSU to implement +;;; __builtin_avr_fmul* if !AVR_HAVE_MUL +/***********************************************************/ + +#define A1 24 +#define B1 25 +#define C0 22 +#define C1 23 +#define A0 __tmp_reg__ + +#ifdef L_fmuls +;;; r23:r22 = fmuls (r24, r25) like in FMULS instruction +;;; Clobbers: r24, r25, __tmp_reg__ +DEFUN __fmuls + ;; A0.7 = negate result? + mov A0, A1 + eor A0, B1 + ;; B1 = |B1| + sbrc B1, 7 + neg B1 + XJMP __fmulsu_exit +ENDF __fmuls +#endif /* L_fmuls */ + +#ifdef L_fmulsu +;;; r23:r22 = fmulsu (r24, r25) like in FMULSU instruction +;;; Clobbers: r24, r25, __tmp_reg__ +DEFUN __fmulsu + ;; A0.7 = negate result? 
+ mov A0, A1 +;; FALLTHRU +ENDF __fmulsu + +;; Helper for __fmuls and __fmulsu +DEFUN __fmulsu_exit + ;; A1 = |A1| + sbrc A1, 7 + neg A1 +#ifdef __AVR_ERRATA_SKIP_JMP_CALL__ + ;; Some cores have problem skipping 2-word instruction + tst A0 + brmi 1f +#else + sbrs A0, 7 +#endif /* __AVR_ERRATA_SKIP_JMP_CALL__ */ + XJMP __fmul +1: XCALL __fmul + ;; C = -C iff A0.7 = 1 + com C1 + neg C0 + sbci C1, -1 + ret +ENDF __fmulsu_exit +#endif /* L_fmulsu */ + + +#ifdef L_fmul +;;; r23:r22 = fmul (r24, r25) like in FMUL instruction +;;; Clobbers: r24, r25, __tmp_reg__ +DEFUN __fmul + ; clear result + clr C0 + clr C1 + clr A0 +1: tst B1 + ;; 1.0 = 0x80, so test for bit 7 of B to see if A must be added to C. +2: brpl 3f + ;; C += A + add C0, A0 + adc C1, A1 +3: ;; A >>= 1 + lsr A1 + ror A0 + ;; B <<= 1 + lsl B1 + brne 2b + ret +ENDF __fmul +#endif /* L_fmul */ + +#undef A0 +#undef A1 +#undef B1 +#undef C0 +#undef C1 diff --git a/libgcc/config/avr/t-avr b/libgcc/config/avr/t-avr index 78829c76af4..f1c114a6dd6 100644 --- a/libgcc/config/avr/t-avr +++ b/libgcc/config/avr/t-avr @@ -1,3 +1,51 @@ +LIB1ASMSRC = avr/lib1funcs.S +LIB1ASMFUNCS = \ + _mulqi3 \ + _mulhi3 \ + _mulhisi3 \ + _umulhisi3 \ + _usmulhisi3 \ + _muluhisi3 \ + _mulshisi3 \ + _mulsi3 \ + _udivmodqi4 \ + _divmodqi4 \ + _udivmodhi4 \ + _divmodhi4 \ + _udivmodsi4 \ + _divmodsi4 \ + _prologue \ + _epilogue \ + _exit \ + _cleanup \ + _tablejump \ + _tablejump_elpm \ + _copy_data \ + _clear_bss \ + _ctors \ + _dtors \ + _ffssi2 \ + _ffshi2 \ + _loop_ffsqi2 \ + _ctzsi2 \ + _ctzhi2 \ + _clzdi2 \ + _clzsi2 \ + _clzhi2 \ + _paritydi2 \ + _paritysi2 \ + _parityhi2 \ + _popcounthi2 \ + _popcountsi2 \ + _popcountdi2 \ + _popcountqi2 \ + _bswapsi2 \ + _bswapdi2 \ + _ashldi3 \ + _ashrdi3 \ + _lshrdi3 \ + _fmul _fmuls _fmulsu + # Extra 16-bit integer functions.
intfuncs16 = _absvXX2 _addvXX3 _subvXX3 _mulvXX3 _negvXX2 _clrsbXX2 diff --git a/libgcc/config/bfin/lib1funcs.S b/libgcc/config/bfin/lib1funcs.S new file mode 100644 index 00000000000..c7bf4f3f05c --- /dev/null +++ b/libgcc/config/bfin/lib1funcs.S @@ -0,0 +1,211 @@ +/* libgcc functions for Blackfin. + Copyright (C) 2005, 2009 Free Software Foundation, Inc. + Contributed by Analog Devices. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. 
*/ + +#ifdef L_divsi3 +.text +.align 2 +.global ___divsi3; +.type ___divsi3, STT_FUNC; + +___divsi3: + [--SP]= RETS; + [--SP] = R7; + + R2 = -R0; + CC = R0 < 0; + IF CC R0 = R2; + R7 = CC; + + R2 = -R1; + CC = R1 < 0; + IF CC R1 = R2; + R2 = CC; + R7 = R7 ^ R2; + + CALL ___udivsi3; + + CC = R7; + R1 = -R0; + IF CC R0 = R1; + + R7 = [SP++]; + RETS = [SP++]; + RTS; +#endif + +#ifdef L_modsi3 +.align 2 +.global ___modsi3; +.type ___modsi3, STT_FUNC; + +___modsi3: + [--SP] = RETS; + [--SP] = R0; + [--SP] = R1; + CALL ___divsi3; + R2 = [SP++]; + R1 = [SP++]; + R2 *= R0; + R0 = R1 - R2; + RETS = [SP++]; + RTS; +#endif + +#ifdef L_udivsi3 +.align 2 +.global ___udivsi3; +.type ___udivsi3, STT_FUNC; + +___udivsi3: + P0 = 32; + LSETUP (0f, 1f) LC0 = P0; + /* upper half of dividend */ + R3 = 0; +0: + /* The first time round in the loop we shift in garbage, but since we + perform 33 shifts, it doesn't matter. */ + R0 = ROT R0 BY 1; + R3 = ROT R3 BY 1; + R2 = R3 - R1; + CC = R3 < R1 (IU); +1: + /* Last instruction of the loop. */ + IF ! CC R3 = R2; + + /* Shift in the last bit. */ + R0 = ROT R0 BY 1; + /* R0 is the result, R3 contains the remainder. 
*/ + R0 = ~ R0; + RTS; +#endif + +#ifdef L_umodsi3 +.align 2 +.global ___umodsi3; +.type ___umodsi3, STT_FUNC; + +___umodsi3: + [--SP] = RETS; + CALL ___udivsi3; + R0 = R3; + RETS = [SP++]; + RTS; +#endif + +#ifdef L_umulsi3_highpart +.align 2 +.global ___umulsi3_highpart; +.type ___umulsi3_highpart, STT_FUNC; + +___umulsi3_highpart: + A1 = R1.L * R0.L (FU); + A1 = A1 >> 16; + A0 = R1.H * R0.H, A1 += R1.L * R0.H (FU); + A1 += R0.L * R1.H (FU); + A1 = A1 >> 16; + A0 += A1; + R0 = A0 (FU); + RTS; +#endif + +#ifdef L_smulsi3_highpart +.align 2 +.global ___smulsi3_highpart; +.type ___smulsi3_highpart, STT_FUNC; + +___smulsi3_highpart: + A1 = R1.L * R0.L (FU); + A1 = A1 >> 16; + A0 = R0.H * R1.H, A1 += R0.H * R1.L (IS,M); + A1 += R1.H * R0.L (IS,M); + A1 = A1 >>> 16; + R0 = (A0 += A1); + RTS; +#endif + +#ifdef L_muldi3 +.align 2 +.global ___muldi3; +.type ___muldi3, STT_FUNC; + +/* + R1:R0 * R3:R2 + = R1.h:R1.l:R0.h:R0.l * R3.h:R3.l:R2.h:R2.l +[X] = (R1.h * R3.h) * 2^96 +[X] + (R1.h * R3.l + R1.l * R3.h) * 2^80 +[X] + (R1.h * R2.h + R1.l * R3.l + R3.h * R0.h) * 2^64 +[T1] + (R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h) * 2^48 +[T2] + (R1.l * R2.l + R3.l * R0.l + R0.h * R2.h) * 2^32 +[T3] + (R0.l * R2.h + R2.l * R0.h) * 2^16 +[T4] + (R0.l * R2.l) + + We can discard the first three lines marked "X" since we produce + only a 64 bit result. So, we need ten 16-bit multiplies. + + Individual mul-acc results: +[E1] = R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h +[E2] = R1.l * R2.l + R3.l * R0.l + R0.h * R2.h +[E3] = R0.l * R2.h + R2.l * R0.h +[E4] = R0.l * R2.l + + We also need to add high parts from lower-level results to higher ones: + E[n]c = E[n] + (E[n+1]c >> 16), where E4c := E4 + + One interesting property is that all parts of the result that depend + on the sign of the multiplication are discarded. 
Those would be the + multiplications involving R1.h and R3.h, but only the top 16 bit of + the 32 bit result depend on the sign, and since R1.h and R3.h only + occur in E1, the top half of these results is cut off. + So, we can just use FU mode for all of the 16-bit multiplies, and + ignore questions of when to use mixed mode. */ + +___muldi3: + /* [SP] technically is part of the caller's frame, but we can + use it as scratch space. */ + A0 = R2.H * R1.L, A1 = R2.L * R1.H (FU) || R3 = [SP + 12]; /* E1 */ + A0 += R3.H * R0.L, A1 += R3.L * R0.H (FU) || [SP] = R4; /* E1 */ + A0 += A1; /* E1 */ + R4 = A0.w; + A0 = R0.l * R3.l (FU); /* E2 */ + A0 += R2.l * R1.l (FU); /* E2 */ + + A1 = R2.L * R0.L (FU); /* E4 */ + R3 = A1.w; + A1 = A1 >> 16; /* E3c */ + A0 += R2.H * R0.H, A1 += R2.L * R0.H (FU); /* E2, E3c */ + A1 += R0.L * R2.H (FU); /* E3c */ + R0 = A1.w; + A1 = A1 >> 16; /* E2c */ + A0 += A1; /* E2c */ + R1 = A0.w; + + /* low(result) = low(E3c):low(E4) */ + R0 = PACK (R0.l, R3.l); + /* high(result) = E2c + (E1 << 16) */ + R1.h = R1.h + R4.l (NS) || R4 = [SP]; + RTS; + +.size ___muldi3, .-___muldi3 +#endif diff --git a/libgcc/config/bfin/t-bfin b/libgcc/config/bfin/t-bfin new file mode 100644 index 00000000000..bc2b088ffc1 --- /dev/null +++ b/libgcc/config/bfin/t-bfin @@ -0,0 +1,3 @@ +LIB1ASMSRC = bfin/lib1funcs.S +LIB1ASMFUNCS = _divsi3 _udivsi3 _umodsi3 _modsi3 _muldi3 _umulsi3_highpart +LIB1ASMFUNCS += _smulsi3_highpart diff --git a/libgcc/config/c6x/lib1funcs.S b/libgcc/config/c6x/lib1funcs.S new file mode 100644 index 00000000000..5bf34474bbd --- /dev/null +++ b/libgcc/config/c6x/lib1funcs.S @@ -0,0 +1,438 @@ +/* Copyright 2010, 2011 Free Software Foundation, Inc. + Contributed by Bernd Schmidt <bernds@codesourcery.com>. + +This file is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3, or (at your option) any +later version. 
+ +This file is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + + ;; ABI considerations for the divide functions + ;; The following registers are call-used: + ;; __c6xabi_divi A0,A1,A2,A4,A6,B0,B1,B2,B4,B5 + ;; __c6xabi_divu A0,A1,A2,A4,A6,B0,B1,B2,B4 + ;; __c6xabi_remi A1,A2,A4,A5,A6,B0,B1,B2,B4 + ;; __c6xabi_remu A1,A4,A5,A7,B0,B1,B2,B4 + ;; + ;; In our implementation, divu and remu are leaf functions, + ;; while both divi and remi call into divu. + ;; A0 is not clobbered by any of the functions. + ;; divu does not clobber B2 either, which is taken advantage of + ;; in remi. + ;; divi uses B5 to hold the original return address during + ;; the call to divu. + ;; remi uses B2 and A5 to hold the input values during the + ;; call to divu. It stores B3 on the stack.
+ +#ifdef L_divsi3 +.text +.align 2 +.global __c6xabi_divi +.hidden __c6xabi_divi +.type __c6xabi_divi, STT_FUNC + +__c6xabi_divi: + call .s2 __c6xabi_divu +|| mv .d2 B3, B5 +|| cmpgt .l1 0, A4, A1 +|| cmpgt .l2 0, B4, B1 + + [A1] neg .l1 A4, A4 +|| [B1] neg .l2 B4, B4 +|| xor .s1x A1, B1, A1 + +#ifdef _TMS320C6400 + [A1] addkpc .s2 1f, B3, 4 +#else + [A1] mvkl .s2 1f, B3 + [A1] mvkh .s2 1f, B3 + nop 2 +#endif +1: + neg .l1 A4, A4 +|| mv .l2 B3,B5 +|| ret .s2 B5 + nop 5 +#endif + +#if defined L_modsi3 || defined L_divmodsi4 +.align 2 +#ifdef L_modsi3 +#define MOD_OUTPUT_REG A4 +.global __c6xabi_remi +.hidden __c6xabi_remi +.type __c6xabi_remi, STT_FUNC +#else +#define MOD_OUTPUT_REG A5 +.global __c6xabi_divremi +.hidden __c6xabi_divremi +.type __c6xabi_divremi, STT_FUNC +__c6xabi_divremi: +#endif + +__c6xabi_remi: + stw .d2t2 B3, *B15--[2] +|| cmpgt .l1 0, A4, A1 +|| cmpgt .l2 0, B4, B2 +|| mv .s1 A4, A5 +|| call .s2 __c6xabi_divu + + [A1] neg .l1 A4, A4 +|| [B2] neg .l2 B4, B4 +|| xor .s2x B2, A1, B0 +|| mv .d2 B4, B2 + +#ifdef _TMS320C6400 + [B0] addkpc .s2 1f, B3, 1 + [!B0] addkpc .s2 2f, B3, 1 + nop 2 +#else + [B0] mvkl .s2 1f,B3 + [!B0] mvkl .s2 2f,B3 + + [B0] mvkh .s2 1f,B3 + [!B0] mvkh .s2 2f,B3 +#endif +1: + neg .l1 A4, A4 +2: + ldw .d2t2 *++B15[2], B3 + +#ifdef _TMS320C6400_PLUS + mpy32 .m1x A4, B2, A6 + nop 3 + ret .s2 B3 + sub .l1 A5, A6, MOD_OUTPUT_REG + nop 4 +#else + mpyu .m1x A4, B2, A1 + nop 1 + mpylhu .m1x A4, B2, A6 +|| mpylhu .m2x B2, A4, B2 + nop 1 + add .l1x A6, B2, A6 +|| ret .s2 B3 + shl .s1 A6, 16, A6 + add .d1 A6, A1, A6 + sub .l1 A5, A6, MOD_OUTPUT_REG + nop 2 +#endif + +#endif + +#if defined L_udivsi3 || defined L_udivmodsi4 +.align 2 +#ifdef L_udivsi3 +.global __c6xabi_divu +.hidden __c6xabi_divu +.type __c6xabi_divu, STT_FUNC +__c6xabi_divu: +#else +.global __c6xabi_divremu +.hidden __c6xabi_divremu +.type __c6xabi_divremu, STT_FUNC +__c6xabi_divremu: +#endif + ;; We use a series of up to 31 subc instructions. 
First, we find + ;; out how many leading zero bits there are in the divisor. This + ;; gives us both a shift count for aligning (shifting) the divisor + ;; to the top bit, and the number of times we have to execute subc. + + ;; At the end, we have both the remainder and most of the quotient + ;; in A4. The top bit of the quotient is computed first and is + ;; placed in A2. + + ;; Return immediately if the dividend is zero. Setting B4 to 1 + ;; is a trick to allow us to leave the following insns in the jump + ;; delay slot without affecting the result. + mv .s2x A4, B1 + +#ifndef _TMS320C6400 +[!b1] mvk .s2 1, B4 +#endif +[b1] lmbd .l2 1, B4, B1 +||[!b1] b .s2 B3 ; RETURN A +#ifdef _TMS320C6400 +||[!b1] mvk .d2 1, B4 +#endif +#ifdef L_udivmodsi4 +||[!b1] zero .s1 A5 +#endif + mv .l1x B1, A6 +|| shl .s2 B4, B1, B4 + + ;; The loop performs a maximum of 28 steps, so we do the + ;; first 3 here. + cmpltu .l1x A4, B4, A2 +[!A2] sub .l1x A4, B4, A4 +|| shru .s2 B4, 1, B4 +|| xor .s1 1, A2, A2 + + shl .s1 A2, 31, A2 +|| [b1] subc .l1x A4,B4,A4 +|| [b1] add .s2 -1, B1, B1 +[b1] subc .l1x A4,B4,A4 +|| [b1] add .s2 -1, B1, B1 + + ;; RETURN A may happen here (note: must happen before the next branch) +0: + cmpgt .l2 B1, 7, B0 +|| [b1] subc .l1x A4,B4,A4 +|| [b1] add .s2 -1, B1, B1 +[b1] subc .l1x A4,B4,A4 +|| [b1] add .s2 -1, B1, B1 +|| [b0] b .s1 0b +[b1] subc .l1x A4,B4,A4 +|| [b1] add .s2 -1, B1, B1 +[b1] subc .l1x A4,B4,A4 +|| [b1] add .s2 -1, B1, B1 +[b1] subc .l1x A4,B4,A4 +|| [b1] add .s2 -1, B1, B1 +[b1] subc .l1x A4,B4,A4 +|| [b1] add .s2 -1, B1, B1 +[b1] subc .l1x A4,B4,A4 +|| [b1] add .s2 -1, B1, B1 + ;; loop backwards branch happens here + + ret .s2 B3 +|| mvk .s1 32, A1 + sub .l1 A1, A6, A6 +#ifdef L_udivmodsi4 +|| extu .s1 A4, A6, A5 +#endif + shl .s1 A4, A6, A4 + shru .s1 A4, 1, A4 +|| sub .l1 A6, 1, A6 + or .l1 A2, A4, A4 + shru .s1 A4, A6, A4 + nop + +#endif + +#ifdef L_umodsi3 +.align 2 +.global __c6xabi_remu +.hidden __c6xabi_remu +.type __c6xabi_remu, STT_FUNC
+__c6xabi_remu: + ;; The ABI seems designed to prevent these functions calling each other, + ;; so we duplicate most of the divsi3 code here. + mv .s2x A4, B1 +#ifndef _TMS320C6400 +[!b1] mvk .s2 1, B4 +#endif + lmbd .l2 1, B4, B1 +||[!b1] b .s2 B3 ; RETURN A +#ifdef _TMS320C6400 +||[!b1] mvk .d2 1, B4 +#endif + + mv .l1x B1, A7 +|| shl .s2 B4, B1, B4 + + cmpltu .l1x A4, B4, A1 +[!a1] sub .l1x A4, B4, A4 + shru .s2 B4, 1, B4 + +0: + cmpgt .l2 B1, 7, B0 +|| [b1] subc .l1x A4,B4,A4 +|| [b1] add .s2 -1, B1, B1 + ;; RETURN A may happen here (note: must happen before the next branch) +[b1] subc .l1x A4,B4,A4 +|| [b1] add .s2 -1, B1, B1 +|| [b0] b .s1 0b +[b1] subc .l1x A4,B4,A4 +|| [b1] add .s2 -1, B1, B1 +[b1] subc .l1x A4,B4,A4 +|| [b1] add .s2 -1, B1, B1 +[b1] subc .l1x A4,B4,A4 +|| [b1] add .s2 -1, B1, B1 +[b1] subc .l1x A4,B4,A4 +|| [b1] add .s2 -1, B1, B1 +[b1] subc .l1x A4,B4,A4 +|| [b1] add .s2 -1, B1, B1 + ;; loop backwards branch happens here + + ret .s2 B3 +[b1] subc .l1x A4,B4,A4 +|| [b1] add .s2 -1, B1, B1 +[b1] subc .l1x A4,B4,A4 + + extu .s1 A4, A7, A4 + nop 2 +#endif + +#if defined L_strasgi_64plus && defined _TMS320C6400_PLUS + +.align 2 +.global __c6xabi_strasgi_64plus +.hidden __c6xabi_strasgi_64plus +.type __c6xabi_strasgi_64plus, STT_FUNC +__c6xabi_strasgi_64plus: + shru .s2x a6, 2, b31 +|| mv .s1 a4, a30 +|| mv .d2 b4, b30 + + add .s2 -4, b31, b31 + + sploopd 1 +|| mvc .s2 b31, ilc + ldw .d2t2 *b30++, b31 + nop 4 + mv .s1x b31,a31 + spkernel 6, 0 +|| stw .d1t1 a31, *a30++ + + ret .s2 b3 + nop 5 +#endif + +#ifdef L_strasgi +.global __c6xabi_strasgi +.type __c6xabi_strasgi, STT_FUNC +__c6xabi_strasgi: + ;; This is essentially memcpy, with alignment known to be at least + ;; 4, and the size a multiple of 4 greater than or equal to 28. 
+ ldw .d2t1 *B4++, A0 +|| mvk .s2 16, B1 + ldw .d2t1 *B4++, A1 +|| mvk .s2 20, B2 +|| sub .d1 A6, 24, A6 + ldw .d2t1 *B4++, A5 + ldw .d2t1 *B4++, A7 +|| mv .l2x A6, B7 + ldw .d2t1 *B4++, A8 + ldw .d2t1 *B4++, A9 +|| mv .s2x A0, B5 +|| cmpltu .l2 B2, B7, B0 + +0: + stw .d1t2 B5, *A4++ +||[b0] ldw .d2t1 *B4++, A0 +|| mv .s2x A1, B5 +|| mv .l2 B7, B6 + +[b0] sub .d2 B6, 24, B7 +||[b0] b .s2 0b +|| cmpltu .l2 B1, B6, B0 + +[b0] ldw .d2t1 *B4++, A1 +|| stw .d1t2 B5, *A4++ +|| mv .s2x A5, B5 +|| cmpltu .l2 12, B6, B0 + +[b0] ldw .d2t1 *B4++, A5 +|| stw .d1t2 B5, *A4++ +|| mv .s2x A7, B5 +|| cmpltu .l2 8, B6, B0 + +[b0] ldw .d2t1 *B4++, A7 +|| stw .d1t2 B5, *A4++ +|| mv .s2x A8, B5 +|| cmpltu .l2 4, B6, B0 + +[b0] ldw .d2t1 *B4++, A8 +|| stw .d1t2 B5, *A4++ +|| mv .s2x A9, B5 +|| cmpltu .l2 0, B6, B0 + +[b0] ldw .d2t1 *B4++, A9 +|| stw .d1t2 B5, *A4++ +|| mv .s2x A0, B5 +|| cmpltu .l2 B2, B7, B0 + + ;; loop back branch happens here + + cmpltu .l2 B1, B6, B0 +|| ret .s2 b3 + +[b0] stw .d1t1 A1, *A4++ +|| cmpltu .l2 12, B6, B0 +[b0] stw .d1t1 A5, *A4++ +|| cmpltu .l2 8, B6, B0 +[b0] stw .d1t1 A7, *A4++ +|| cmpltu .l2 4, B6, B0 +[b0] stw .d1t1 A8, *A4++ +|| cmpltu .l2 0, B6, B0 +[b0] stw .d1t1 A9, *A4++ + + ;; return happens here + +#endif + +#ifdef _TMS320C6400_PLUS +#ifdef L_push_rts +.align 2 +.global __c6xabi_push_rts +.hidden __c6xabi_push_rts +.type __c6xabi_push_rts, STT_FUNC +__c6xabi_push_rts: + stw .d2t2 B14, *B15--[2] + stdw .d2t1 A15:A14, *B15-- +|| b .s2x A3 + stdw .d2t2 B13:B12, *B15-- + stdw .d2t1 A13:A12, *B15-- + stdw .d2t2 B11:B10, *B15-- + stdw .d2t1 A11:A10, *B15-- + stdw .d2t2 B3:B2, *B15-- +#endif + +#ifdef L_pop_rts +.align 2 +.global __c6xabi_pop_rts +.hidden __c6xabi_pop_rts +.type __c6xabi_pop_rts, STT_FUNC +__c6xabi_pop_rts: + lddw .d2t2 *++B15, B3:B2 + lddw .d2t1 *++B15, A11:A10 + lddw .d2t2 *++B15, B11:B10 + lddw .d2t1 *++B15, A13:A12 + lddw .d2t2 *++B15, B13:B12 + lddw .d2t1 *++B15, A15:A14 +|| b .s2 B3 + ldw .d2t2 *++B15[2], B14 + nop 4 +#endif 
+ +#ifdef L_call_stub +.align 2 +.global __c6xabi_call_stub +.type __c6xabi_call_stub, STT_FUNC +__c6xabi_call_stub: + stw .d2t1 A2, *B15--[2] + stdw .d2t1 A7:A6, *B15-- +|| call .s2 B31 + stdw .d2t1 A1:A0, *B15-- + stdw .d2t2 B7:B6, *B15-- + stdw .d2t2 B5:B4, *B15-- + stdw .d2t2 B1:B0, *B15-- + stdw .d2t2 B3:B2, *B15-- +|| addkpc .s2 1f, B3, 0 +1: + lddw .d2t2 *++B15, B3:B2 + lddw .d2t2 *++B15, B1:B0 + lddw .d2t2 *++B15, B5:B4 + lddw .d2t2 *++B15, B7:B6 + lddw .d2t1 *++B15, A1:A0 + lddw .d2t1 *++B15, A7:A6 +|| b .s2 B3 + ldw .d2t1 *++B15[2], A2 + nop 4 +#endif + +#endif + diff --git a/libgcc/config/c6x/t-elf b/libgcc/config/c6x/t-elf index 99d0cd2d5ca..e01c4109e52 100644 --- a/libgcc/config/c6x/t-elf +++ b/libgcc/config/c6x/t-elf @@ -1,6 +1,11 @@ # Cannot use default rules due to $(CRTSTUFF_T_CFLAGS). CUSTOM_CRTIN = yes +LIB1ASMSRC = c6x/lib1funcs.S +LIB1ASMFUNCS = _divsi3 _udivsi3 _umodsi3 _modsi3 _udivmodsi4 _divmodsi4 +LIB1ASMFUNCS += _strasgi _strasgi_64plus _clzsi2 _clzdi2 _clz +LIB1ASMFUNCS += _push_rts _pop_rts _call_stub + # Assemble startup files. crti.o: $(srcdir)/config/c6x/crti.S $(crt_compile) -c $(CRTSTUFF_T_CFLAGS) $< diff --git a/libgcc/config/fr30/lib1funcs.S b/libgcc/config/fr30/lib1funcs.S new file mode 100644 index 00000000000..7c63453123a --- /dev/null +++ b/libgcc/config/fr30/lib1funcs.S @@ -0,0 +1,115 @@ +/* libgcc routines for the FR30. + Copyright (C) 1998, 1999, 2009 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3, or (at your option) any +later version. + +This file is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. 
+ +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + + .macro FUNC_START name + .text + .globl __\name + .type __\name, @function +__\name: + .endm + + .macro FUNC_END name + .size __\name, . - __\name + .endm + + .macro DIV_BODY reg number + .if \number + DIV_BODY \reg, "\number - 1" + div1 \reg + .endif + .endm + +#ifdef L_udivsi3 +FUNC_START udivsi3 + ;; Perform an unsigned division of r4 / r5 and place the result in r4. + ;; Does not handle overflow yet... + mov r4, mdl + div0u r5 + DIV_BODY r5 32 + mov mdl, r4 + ret +FUNC_END udivsi3 +#endif /* L_udivsi3 */ + +#ifdef L_divsi3 +FUNC_START divsi3 + ;; Perform a signed division of r4 / r5 and place the result in r4. + ;; Does not handle overflow yet... + mov r4, mdl + div0s r5 + DIV_BODY r5 32 + div2 r5 + div3 + div4s + mov mdl, r4 + ret +FUNC_END divsi3 +#endif /* L_divsi3 */ + +#ifdef L_umodsi3 +FUNC_START umodsi3 + ;; Perform an unsigned division of r4 / r5 and place the remainder in r4. + ;; Does not handle overflow yet... + mov r4, mdl + div0u r5 + DIV_BODY r5 32 + mov mdh, r4 + ret +FUNC_END umodsi3 +#endif /* L_umodsi3 */ + +#ifdef L_modsi3 +FUNC_START modsi3 + ;; Perform a signed division of r4 / r5 and place the remainder in r4. + ;; Does not handle overflow yet...
+ mov r4, mdl + div0s r5 + DIV_BODY r5 32 + div2 r5 + div3 + div4s + mov mdh, r4 + ret +FUNC_END modsi3 +#endif /* L_modsi3 */ + +#ifdef L_negsi2 +FUNC_START negsi2 + ldi:8 #0, r0 + sub r4, r0 + mov r0, r4 + ret +FUNC_END negsi2 +#endif /* L_negsi2 */ + +#ifdef L_one_cmplsi2 +FUNC_START one_cmplsi2 + ldi:8 #0xff, r0 + extsb r0 + eor r0, r4 + ret +FUNC_END one_cmplsi2 +#endif /* L_one_cmplsi2 */ + + diff --git a/libgcc/config/fr30/t-fr30 b/libgcc/config/fr30/t-fr30 new file mode 100644 index 00000000000..ee5ed9a127e --- /dev/null +++ b/libgcc/config/fr30/t-fr30 @@ -0,0 +1,2 @@ +LIB1ASMSRC = fr30/lib1funcs.S +LIB1ASMFUNCS = _udivsi3 _divsi3 _umodsi3 _modsi3 diff --git a/libgcc/config/frv/lib1funcs.S b/libgcc/config/frv/lib1funcs.S new file mode 100644 index 00000000000..d1ffcab6133 --- /dev/null +++ b/libgcc/config/frv/lib1funcs.S @@ -0,0 +1,269 @@ +/* Library functions. + Copyright (C) 2000, 2003, 2008, 2009 Free Software Foundation, Inc. + Contributed by Red Hat, Inc. + + This file is part of GCC. + + GCC is free software ; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY ; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <frv-asm.h> + + +#ifdef L_cmpll +/* icc0 = __cmpll (long long a, long long b) */ + + .globl EXT(__cmpll) + .type EXT(__cmpll),@function + .text + .p2align 4 +EXT(__cmpll): + cmp gr8, gr10, icc0 + ckeq icc0, cc4 + P(ccmp) gr9, gr11, cc4, 1 + ret +.Lend: + .size EXT(__cmpll),.Lend-EXT(__cmpll) +#endif /* L_cmpll */ + +#ifdef L_cmpf +/* icc0 = __cmpf (float a, float b) */ +/* Note, because this function returns the result in ICC0, it means it can't + handle NaNs. */ + + .globl EXT(__cmpf) + .type EXT(__cmpf),@function + .text + .p2align 4 +EXT(__cmpf): +#ifdef __FRV_HARD_FLOAT__ /* floating point instructions available */ + movgf gr8, fr0 + P(movgf) gr9, fr1 + setlos #1, gr8 + fcmps fr0, fr1, fcc0 + P(fcklt) fcc0, cc0 + fckeq fcc0, cc1 + csub gr0, gr8, gr8, cc0, 1 + cmov gr0, gr8, cc1, 1 + cmpi gr8, 0, icc0 + ret +#else /* no floating point instructions available */ + movsg lr, gr4 + addi sp, #-16, sp + sti gr4, @(sp, 8) + st fp, @(sp, gr0) + mov sp, fp + call EXT(__cmpsf2) + cmpi gr8, #0, icc0 + ldi @(sp, 8), gr4 + movgs gr4, lr + ld @(sp,gr0), fp + addi sp, #16, sp + ret +#endif +.Lend: + .size EXT(__cmpf),.Lend-EXT(__cmpf) +#endif + +#ifdef L_cmpd +/* icc0 = __cmpd (double a, double b) */ +/* Note, because this function returns the result in ICC0, it means it can't + handle NaNs. */ + + .globl EXT(__cmpd) + .type EXT(__cmpd),@function + .text + .p2align 4 +EXT(__cmpd): + movsg lr, gr4 + addi sp, #-16, sp + sti gr4, @(sp, 8) + st fp, @(sp, gr0) + mov sp, fp + call EXT(__cmpdf2) + cmpi gr8, #0, icc0 + ldi @(sp, 8), gr4 + movgs gr4, lr + ld @(sp,gr0), fp + addi sp, #16, sp + ret +.Lend: + .size EXT(__cmpd),.Lend-EXT(__cmpd) +#endif + +#ifdef L_addll +/* gr8,gr9 = __addll (long long a, long long b) */ +/* Note, gcc will never call this function, but it is present in case an + ABI program calls it. 
*/ + + .globl EXT(__addll) + .type EXT(__addll),@function + .text + .p2align 4 /* same alignment as the other helpers in this file */ +EXT(__addll): + addcc gr9, gr11, gr9, icc0 + addx gr8, gr10, gr8, icc0 + ret +.Lend: + .size EXT(__addll),.Lend-EXT(__addll) +#endif + +#ifdef L_subll +/* gr8,gr9 = __subll (long long a, long long b) */ +/* Note, gcc will never call this function, but it is present in case an + ABI program calls it. */ + + .globl EXT(__subll) + .type EXT(__subll),@function + .text + .p2align 4 +EXT(__subll): + subcc gr9, gr11, gr9, icc0 + subx gr8, gr10, gr8, icc0 + ret +.Lend: + .size EXT(__subll),.Lend-EXT(__subll) +#endif + +#ifdef L_andll +/* gr8,gr9 = __andll (long long a, long long b) */ +/* Note, gcc will never call this function, but it is present in case an + ABI program calls it. */ + + .globl EXT(__andll) + .type EXT(__andll),@function + .text + .p2align 4 +EXT(__andll): + P(and) gr9, gr11, gr9 + P2(and) gr8, gr10, gr8 + ret +.Lend: + .size EXT(__andll),.Lend-EXT(__andll) +#endif + +#ifdef L_orll +/* gr8,gr9 = __orll (long long a, long long b) */ +/* Note, gcc will never call this function, but it is present in case an + ABI program calls it. */ + + .globl EXT(__orll) + .type EXT(__orll),@function + .text + .p2align 4 +EXT(__orll): + P(or) gr9, gr11, gr9 + P2(or) gr8, gr10, gr8 + ret +.Lend: + .size EXT(__orll),.Lend-EXT(__orll) +#endif + +#ifdef L_xorll +/* gr8,gr9 = __xorll (long long a, long long b) */ +/* Note, gcc will never call this function, but it is present in case an + ABI program calls it. */ + + .globl EXT(__xorll) + .type EXT(__xorll),@function + .text + .p2align 4 +EXT(__xorll): + P(xor) gr9, gr11, gr9 + P2(xor) gr8, gr10, gr8 + ret +.Lend: + .size EXT(__xorll),.Lend-EXT(__xorll) +#endif + +#ifdef L_notll +/* gr8,gr9 = __notll (long long a) */ +/* Note, gcc will never call this function, but it is present in case an + ABI program calls it.
*/ + + .globl EXT(__notll) + .type EXT(__notll),@function + .text + .p2align 4 +EXT(__notll): + P(not) gr9, gr9 + P2(not) gr8, gr8 + ret +.Lend: + .size EXT(__notll),.Lend-EXT(__notll) +#endif + +#ifdef L_cmov +/* (void) __cmov (char *dest, const char *src, size_t len) */ +/* + * void __cmov (char *dest, const char *src, size_t len) + * { + * size_t i; + * + * if (dest < src || dest > src+len) + * { + * for (i = 0; i < len; i++) + * dest[i] = src[i]; + * } + * else + * { + * while (len-- > 0) + * dest[len] = src[len]; + * } + * } + */ + + .globl EXT(__cmov) + .type EXT(__cmov),@function + .text + .p2align 4 +EXT(__cmov): + P(cmp) gr8, gr9, icc0 + add gr9, gr10, gr4 + P(cmp) gr8, gr4, icc1 + bc icc0, 0, .Lfwd + bls icc1, 0, .Lback +.Lfwd: + /* move bytes in a forward direction */ + P(setlos) #0, gr5 + cmp gr0, gr10, icc0 + P(subi) gr9, #1, gr9 + P2(subi) gr8, #1, gr8 + bnc icc0, 0, .Lret +.Lfloop: + /* forward byte move loop */ + addi gr5, #1, gr5 + P(ldsb) @(gr9, gr5), gr4 + cmp gr5, gr10, icc0 + P(stb) gr4, @(gr8, gr5) + bc icc0, 0, .Lfloop + ret +.Lbloop: + /* backward byte move loop body */ + ldsb @(gr9,gr10),gr4 + stb gr4,@(gr8,gr10) +.Lback: + P(cmpi) gr10, #0, icc0 + addi gr10, #-1, gr10 + bne icc0, 0, .Lbloop +.Lret: + ret +.Lend: + .size EXT(__cmov),.Lend-EXT(__cmov) +#endif diff --git a/libgcc/config/frv/t-frv b/libgcc/config/frv/t-frv index b364a5a25b9..9773722d8e7 100644 --- a/libgcc/config/frv/t-frv +++ b/libgcc/config/frv/t-frv @@ -1,3 +1,6 @@ +LIB1ASMSRC = frv/lib1funcs.S +LIB1ASMFUNCS = _cmpll _cmpf _cmpd _addll _subll _andll _orll _xorll _notll _cmov + # Compile two additional files that are linked with every program # linked using GCC on systems using COFF or ELF, for the sake of C++ # constructors. diff --git a/libgcc/config/h8300/lib1funcs.S b/libgcc/config/h8300/lib1funcs.S new file mode 100644 index 00000000000..1b75b73269d --- /dev/null +++ b/libgcc/config/h8300/lib1funcs.S @@ -0,0 +1,838 @@ +;; libgcc routines for the Renesas H8/300 CPU. 
+;; Contributed by Steve Chamberlain <sac@cygnus.com> +;; Optimizations by Toshiyasu Morita <toshiyasu.morita@renesas.com> + +/* Copyright (C) 1994, 2000, 2001, 2002, 2003, 2004, 2009 + Free Software Foundation, Inc. + +This file is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3, or (at your option) any +later version. + +This file is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + +/* Assembler register definitions. 
*/ + +#define A0 r0 +#define A0L r0l +#define A0H r0h + +#define A1 r1 +#define A1L r1l +#define A1H r1h + +#define A2 r2 +#define A2L r2l +#define A2H r2h + +#define A3 r3 +#define A3L r3l +#define A3H r3h + +#define S0 r4 +#define S0L r4l +#define S0H r4h + +#define S1 r5 +#define S1L r5l +#define S1H r5h + +#define S2 r6 +#define S2L r6l +#define S2H r6h + +#ifdef __H8300__ +#define PUSHP push +#define POPP pop + +#define A0P r0 +#define A1P r1 +#define A2P r2 +#define A3P r3 +#define S0P r4 +#define S1P r5 +#define S2P r6 +#endif + +#if defined (__H8300H__) || defined (__H8300S__) || defined (__H8300SX__) +#define PUSHP push.l +#define POPP pop.l + +#define A0P er0 +#define A1P er1 +#define A2P er2 +#define A3P er3 +#define S0P er4 +#define S1P er5 +#define S2P er6 + +#define A0E e0 +#define A1E e1 +#define A2E e2 +#define A3E e3 +#endif + +#ifdef __H8300H__ +#ifdef __NORMAL_MODE__ + .h8300hn +#else + .h8300h +#endif +#endif + +#ifdef __H8300S__ +#ifdef __NORMAL_MODE__ + .h8300sn +#else + .h8300s +#endif +#endif +#ifdef __H8300SX__ +#ifdef __NORMAL_MODE__ + .h8300sxn +#else + .h8300sx +#endif +#endif + +#ifdef L_cmpsi2 +#ifdef __H8300__ + .section .text + .align 2 + .global ___cmpsi2 +___cmpsi2: + cmp.w A0,A2 + bne .L2 + cmp.w A1,A3 + bne .L4 + mov.w #1,A0 + rts +.L2: + bgt .L5 +.L3: + mov.w #2,A0 + rts +.L4: + bls .L3 +.L5: + sub.w A0,A0 + rts + .end +#endif +#endif /* L_cmpsi2 */ + +#ifdef L_ucmpsi2 +#ifdef __H8300__ + .section .text + .align 2 + .global ___ucmpsi2 +___ucmpsi2: + cmp.w A0,A2 + bne .L2 + cmp.w A1,A3 + bne .L4 + mov.w #1,A0 + rts +.L2: + bhi .L5 +.L3: + mov.w #2,A0 + rts +.L4: + bls .L3 +.L5: + sub.w A0,A0 + rts + .end +#endif +#endif /* L_ucmpsi2 */ + +#ifdef L_divhi3 + +;; HImode divides for the H8/300. +;; We bunch all of this into one object file since there are several +;; "supporting routines". 
+ +; general purpose normalize routine +; +; divisor in A0 +; dividend in A1 +; turns both into +ve numbers, and leaves what the answer sign +; should be in A2L + +#ifdef __H8300__ + .section .text + .align 2 +divnorm: + or A0H,A0H ; is divisor > 0 + stc ccr,A2L + bge _lab1 + not A0H ; no - then make it +ve + not A0L + adds #1,A0 +_lab1: or A1H,A1H ; look at dividend + bge _lab2 + not A1H ; it is -ve, make it positive + not A1L + adds #1,A1 + xor #0x8,A2L; and toggle sign of result +_lab2: rts +;; Basically the same, except that the sign of the divisor determines +;; the sign. +modnorm: + or A0H,A0H ; is divisor > 0 + stc ccr,A2L + bge _lab7 + not A0H ; no - then make it +ve + not A0L + adds #1,A0 +_lab7: or A1H,A1H ; look at dividend + bge _lab8 + not A1H ; it is -ve, make it positive + not A1L + adds #1,A1 +_lab8: rts + +; A0=A0/A1 signed + + .global ___divhi3 +___divhi3: + bsr divnorm + bsr ___udivhi3 +negans: btst #3,A2L ; should answer be negative ? + beq _lab4 + not A0H ; yes, so make it so + not A0L + adds #1,A0 +_lab4: rts + +; A0=A0%A1 signed + + .global ___modhi3 +___modhi3: + bsr modnorm + bsr ___udivhi3 + mov A3,A0 + bra negans + +; A0=A0%A1 unsigned + + .global ___umodhi3 +___umodhi3: + bsr ___udivhi3 + mov A3,A0 + rts + +; A0=A0/A1 unsigned +; A3=A0%A1 unsigned +; A2H trashed +; D high 8 bits of denom +; d low 8 bits of denom +; N high 8 bits of num +; n low 8 bits of num +; M high 8 bits of mod +; m low 8 bits of mod +; Q high 8 bits of quot +; q low 8 bits of quot +; P preserve + +; The H8/300 only has a 16/8 bit divide, so we look at the incoming and +; see how to partition up the expression. 
+ + .global ___udivhi3 +___udivhi3: + ; A0 A1 A2 A3 + ; Nn Dd P + sub.w A3,A3 ; Nn Dd xP 00 + or A1H,A1H + bne divlongway + or A0H,A0H + beq _lab6 + +; we know that D == 0 and N is != 0 + mov.b A0H,A3L ; Nn Dd xP 0N + divxu A1L,A3 ; MQ + mov.b A3L,A0H ; Q +; dealt with N, do n +_lab6: mov.b A0L,A3L ; n + divxu A1L,A3 ; mq + mov.b A3L,A0L ; Qq + mov.b A3H,A3L ; m + mov.b #0x0,A3H ; Qq 0m + rts + +; D != 0 - which means the denominator is +; loop around to get the result. + +divlongway: + mov.b A0H,A3L ; Nn Dd xP 0N + mov.b #0x0,A0H ; high byte of answer has to be zero + mov.b #0x8,A2H ; 8 +div8: add.b A0L,A0L ; n*=2 + rotxl A3L ; Make remainder bigger + rotxl A3H + sub.w A1,A3 ; Q-=N + bhs setbit ; set a bit ? + add.w A1,A3 ; no : too far , Q+=N + + dec A2H + bne div8 ; next bit + rts + +setbit: inc A0L ; do insert bit + dec A2H + bne div8 ; next bit + rts + +#endif /* __H8300__ */ +#endif /* L_divhi3 */ + +#ifdef L_divsi3 + +;; 4 byte integer divides for the H8/300. +;; +;; We have one routine which does all the work and lots of +;; little ones which prepare the args and massage the sign. +;; We bunch all of this into one object file since there are several +;; "supporting routines". + + .section .text + .align 2 + +; Put abs SIs into r0/r1 and r2/r3, and leave a 1 in r6l with sign of rest. +; This function is here to keep branch displacements small. + +#ifdef __H8300__ + +divnorm: + mov.b A0H,A0H ; is the numerator -ve + stc ccr,S2L ; keep the sign in bit 3 of S2L + bge postive + + ; negate arg + not A0H + not A1H + not A0L + not A1L + + add #1,A1L + addx #0,A1H + addx #0,A0L + addx #0,A0H +postive: + mov.b A2H,A2H ; is the denominator -ve + bge postive2 + not A2L + not A2H + not A3L + not A3H + add.b #1,A3L + addx #0,A3H + addx #0,A2L + addx #0,A2H + xor.b #0x08,S2L ; toggle the result sign +postive2: + rts + +;; Basically the same, except that the sign of the divisor determines +;; the sign. 
+modnorm: + mov.b A0H,A0H ; is the numerator -ve + stc ccr,S2L ; keep the sign in bit 3 of S2L + bge mpostive + + ; negate arg + not A0H + not A1H + not A0L + not A1L + + add #1,A1L + addx #0,A1H + addx #0,A0L + addx #0,A0H +mpostive: + mov.b A2H,A2H ; is the denominator -ve + bge mpostive2 + not A2L + not A2H + not A3L + not A3H + add.b #1,A3L + addx #0,A3H + addx #0,A2L + addx #0,A2H +mpostive2: + rts + +#else /* __H8300H__ */ + +divnorm: + mov.l A0P,A0P ; is the numerator -ve + stc ccr,S2L ; keep the sign in bit 3 of S2L + bge postive + + neg.l A0P ; negate arg + +postive: + mov.l A1P,A1P ; is the denominator -ve + bge postive2 + + neg.l A1P ; negate arg + xor.b #0x08,S2L ; toggle the result sign + +postive2: + rts + +;; Basically the same, except that the sign of the divisor determines +;; the sign. +modnorm: + mov.l A0P,A0P ; is the numerator -ve + stc ccr,S2L ; keep the sign in bit 3 of S2L + bge mpostive + + neg.l A0P ; negate arg + +mpostive: + mov.l A1P,A1P ; is the denominator -ve + bge mpostive2 + + neg.l A1P ; negate arg + +mpostive2: + rts + +#endif + +; numerator in A0/A1 +; denominator in A2/A3 + .global ___modsi3 +___modsi3: +#ifdef __H8300__ + PUSHP S2P + PUSHP S0P + PUSHP S1P + bsr modnorm + bsr divmodsi4 + mov S0,A0 + mov S1,A1 + bra exitdiv +#else + PUSHP S2P + bsr modnorm + bsr ___udivsi3 + mov.l er3,er0 + bra exitdiv +#endif + + ;; H8/300H and H8S version of ___udivsi3 is defined later in + ;; the file. 
+#ifdef __H8300__ + .global ___udivsi3 +___udivsi3: + PUSHP S2P + PUSHP S0P + PUSHP S1P + bsr divmodsi4 + bra reti +#endif + + .global ___umodsi3 +___umodsi3: +#ifdef __H8300__ + PUSHP S2P + PUSHP S0P + PUSHP S1P + bsr divmodsi4 + mov S0,A0 + mov S1,A1 + bra reti +#else + bsr ___udivsi3 + mov.l er3,er0 + rts +#endif + + .global ___divsi3 +___divsi3: +#ifdef __H8300__ + PUSHP S2P + PUSHP S0P + PUSHP S1P + jsr divnorm + jsr divmodsi4 +#else + PUSHP S2P + jsr divnorm + bsr ___udivsi3 +#endif + + ; examine what the sign should be +exitdiv: + btst #3,S2L + beq reti + + ; should be -ve +#ifdef __H8300__ + not A0H + not A1H + not A0L + not A1L + + add #1,A1L + addx #0,A1H + addx #0,A0L + addx #0,A0H +#else /* __H8300H__ */ + neg.l A0P +#endif + +reti: +#ifdef __H8300__ + POPP S1P + POPP S0P +#endif + POPP S2P + rts + + ; takes A0/A1 numerator (A0P for H8/300H) + ; A2/A3 denominator (A1P for H8/300H) + ; returns A0/A1 quotient (A0P for H8/300H) + ; S0/S1 remainder (S0P for H8/300H) + ; trashes S2H + +#ifdef __H8300__ + +divmodsi4: + sub.w S0,S0 ; zero play area + mov.w S0,S1 + mov.b A2H,S2H + or A2L,S2H + or A3H,S2H + bne DenHighNonZero + mov.b A0H,A0H + bne NumByte0Zero + mov.b A0L,A0L + bne NumByte1Zero + mov.b A1H,A1H + bne NumByte2Zero + bra NumByte3Zero +NumByte0Zero: + mov.b A0H,S1L + divxu A3L,S1 + mov.b S1L,A0H +NumByte1Zero: + mov.b A0L,S1L + divxu A3L,S1 + mov.b S1L,A0L +NumByte2Zero: + mov.b A1H,S1L + divxu A3L,S1 + mov.b S1L,A1H +NumByte3Zero: + mov.b A1L,S1L + divxu A3L,S1 + mov.b S1L,A1L + + mov.b S1H,S1L + mov.b #0x0,S1H + rts + +; have to do the divide by shift and test +DenHighNonZero: + mov.b A0H,S1L + mov.b A0L,A0H + mov.b A1H,A0L + mov.b A1L,A1H + + mov.b #0,A1L + mov.b #24,S2H ; only do 24 iterations + +nextbit: + add.w A1,A1 ; double the answer guess + rotxl A0L + rotxl A0H + + rotxl S1L ; double remainder + rotxl S1H + rotxl S0L + rotxl S0H + sub.w A3,S1 ; does it all fit + subx A2L,S0L + subx A2H,S0H + bhs setone + + add.w A3,S1 ; no, restore 
mistake + addx A2L,S0L + addx A2H,S0H + + dec S2H + bne nextbit + rts + +setone: + inc A1L + dec S2H + bne nextbit + rts + +#else /* __H8300H__ */ + + ;; This function also computes the remainder and stores it in er3. + .global ___udivsi3 +___udivsi3: + mov.w A1E,A1E ; denominator top word 0? + bne DenHighNonZero + + ; do it the easy way, see page 107 in manual + mov.w A0E,A2 + extu.l A2P + divxu.w A1,A2P + mov.w A2E,A0E + divxu.w A1,A0P + mov.w A0E,A3 + mov.w A2,A0E + extu.l A3P + rts + + ; er0 = er0 / er1 + ; er3 = er0 % er1 + ; trashes er1 er2 + ; expects er1 >= 2^16 +DenHighNonZero: + mov.l er0,er3 + mov.l er1,er2 +#ifdef __H8300H__ +divmod_L21: + shlr.l er0 + shlr.l er2 ; make divisor < 2^16 + mov.w e2,e2 + bne divmod_L21 +#else + shlr.l #2,er2 ; make divisor < 2^16 + mov.w e2,e2 + beq divmod_L22A +divmod_L21: + shlr.l #2,er0 +divmod_L22: + shlr.l #2,er2 ; make divisor < 2^16 + mov.w e2,e2 + bne divmod_L21 +divmod_L22A: + rotxl.w r2 + bcs divmod_L23 + shlr.l er0 + bra divmod_L24 +divmod_L23: + rotxr.w r2 + shlr.l #2,er0 +divmod_L24: +#endif + ;; At this point, + ;; er0 contains shifted dividend + ;; er1 contains divisor + ;; er2 contains shifted divisor + ;; er3 contains dividend, later remainder + divxu.w r2,er0 ; r0 now contains the approximate quotient (AQ) + extu.l er0 + beq divmod_L25 + subs #1,er0 ; er0 = AQ - 1 + mov.w e1,r2 + mulxu.w r0,er2 ; er2 = upper (AQ - 1) * divisor + sub.w r2,e3 ; dividend - 65536 * er2 + mov.w r1,r2 + mulxu.w r0,er2 ; compute er3 = remainder (tentative) + sub.l er2,er3 ; er3 = dividend - (AQ - 1) * divisor +divmod_L25: + cmp.l er1,er3 ; is divisor < remainder? + blo divmod_L26 + adds #1,er0 + sub.l er1,er3 ; correct the remainder +divmod_L26: + rts + +#endif +#endif /* L_divsi3 */ + +#ifdef L_mulhi3 + +;; HImode multiply. +; The H8/300 only has an 8*8->16 multiply. 
+; The answer is the same as: +; +; product = (srca.l * srcb.l) + ((srca.h * srcb.l) + (srcb.h * srca.l)) * 256 +; (we can ignore A1.h * A0.h cause that will all off the top) +; A0 in +; A1 in +; A0 answer + +#ifdef __H8300__ + .section .text + .align 2 + .global ___mulhi3 +___mulhi3: + mov.b A1L,A2L ; A2l gets srcb.l + mulxu A0L,A2 ; A2 gets first sub product + + mov.b A0H,A3L ; prepare for + mulxu A1L,A3 ; second sub product + + add.b A3L,A2H ; sum first two terms + + mov.b A1H,A3L ; third sub product + mulxu A0L,A3 + + add.b A3L,A2H ; almost there + mov.w A2,A0 ; that is + rts + +#endif +#endif /* L_mulhi3 */ + +#ifdef L_mulsi3 + +;; SImode multiply. +;; +;; I think that shift and add may be sufficient for this. Using the +;; supplied 8x8->16 would need 10 ops of 14 cycles each + overhead. This way +;; the inner loop uses maybe 20 cycles + overhead, but terminates +;; quickly on small args. +;; +;; A0/A1 src_a +;; A2/A3 src_b +;; +;; while (a) +;; { +;; if (a & 1) +;; r += b; +;; a >>= 1; +;; b <<= 1; +;; } + + .section .text + .align 2 + +#ifdef __H8300__ + + .global ___mulsi3 +___mulsi3: + PUSHP S0P + PUSHP S1P + + sub.w S0,S0 + sub.w S1,S1 + + ; while (a) +_top: mov.w A0,A0 + bne _more + mov.w A1,A1 + beq _done +_more: ; if (a & 1) + bld #0,A1L + bcc _nobit + ; r += b + add.w A3,S1 + addx A2L,S0L + addx A2H,S0H +_nobit: + ; a >>= 1 + shlr A0H + rotxr A0L + rotxr A1H + rotxr A1L + + ; b <<= 1 + add.w A3,A3 + addx A2L,A2L + addx A2H,A2H + bra _top + +_done: + mov.w S0,A0 + mov.w S1,A1 + POPP S1P + POPP S0P + rts + +#else /* __H8300H__ */ + +; +; mulsi3 for H8/300H - based on Renesas SH implementation +; +; by Toshiyasu Morita +; +; Old code: +; +; 16b * 16b = 372 states (worst case) +; 32b * 32b = 724 states (worst case) +; +; New code: +; +; 16b * 16b = 48 states +; 16b * 32b = 72 states +; 32b * 32b = 92 states +; + + .global ___mulsi3 +___mulsi3: + mov.w r1,r2 ; ( 2 states) b * d + mulxu r0,er2 ; (22 states) + + mov.w e0,r3 ; ( 2 states) a * d + beq L_skip1 
; ( 4 states) + mulxu r1,er3 ; (22 states) + add.w r3,e2 ; ( 2 states) + +L_skip1: + mov.w e1,r3 ; ( 2 states) c * b + beq L_skip2 ; ( 4 states) + mulxu r0,er3 ; (22 states) + add.w r3,e2 ; ( 2 states) + +L_skip2: + mov.l er2,er0 ; ( 2 states) + rts ; (10 states) + +#endif +#endif /* L_mulsi3 */ +#ifdef L_fixunssfsi_asm +/* For the h8300 we use asm to save some bytes, to + allow more programs to fit into the tiny address + space. For the H8/300H and H8S, the C version is good enough. */ +#ifdef __H8300__ +/* We still treat NANs different than libgcc2.c, but then, the + behavior is undefined anyways. */ + .global ___fixunssfsi +___fixunssfsi: + cmp.b #0x4f,r0h + bge Large_num + jmp @___fixsfsi +Large_num: + bhi L_huge_num + xor.b #0x80,A0L + bmi L_shift8 +L_huge_num: + mov.w #65535,A0 + mov.w A0,A1 + rts +L_shift8: + mov.b A0L,A0H + mov.b A1H,A0L + mov.b A1L,A1H + mov.b #0,A1L + rts +#endif +#endif /* L_fixunssfsi_asm */ diff --git a/libgcc/config/h8300/t-h8300 b/libgcc/config/h8300/t-h8300 new file mode 100644 index 00000000000..4602ff8b9ef --- /dev/null +++ b/libgcc/config/h8300/t-h8300 @@ -0,0 +1,3 @@ +LIB1ASMSRC = h8300/lib1funcs.S +LIB1ASMFUNCS = _cmpsi2 _ucmpsi2 _divhi3 _divsi3 _mulhi3 _mulsi3 \ + _fixunssfsi_asm diff --git a/libgcc/config/i386/cygwin.S b/libgcc/config/i386/cygwin.S new file mode 100644 index 00000000000..8f9c486850e --- /dev/null +++ b/libgcc/config/i386/cygwin.S @@ -0,0 +1,188 @@ +/* stuff needed for libgcc on win32. + * + * Copyright (C) 1996, 1998, 2001, 2003, 2008, 2009, 2010 + * Free Software Foundation, Inc. + * Written By Steve Chamberlain + * + * This file is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 3, or (at your option) any + * later version. 
+ * + * This file is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Under Section 7 of GPL version 3, you are granted additional + * permissions described in the GCC Runtime Library Exception, version + * 3.1, as published by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License and + * a copy of the GCC Runtime Library Exception along with this program; + * see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + * <http://www.gnu.org/licenses/>. + */ + +#include "auto-host.h" + +#ifdef HAVE_GAS_CFI_SECTIONS_DIRECTIVE + .cfi_sections .debug_frame +# define cfi_startproc() .cfi_startproc +# define cfi_endproc() .cfi_endproc +# define cfi_adjust_cfa_offset(X) .cfi_adjust_cfa_offset X +# define cfi_def_cfa_register(X) .cfi_def_cfa_register X +# define cfi_register(D,S) .cfi_register D, S +/* cfi_push/cfi_pop describe a one-slot stack adjustment for register X: 8 bytes on _WIN64, 4 bytes otherwise. */ +# ifdef _WIN64 +# define cfi_push(X) .cfi_adjust_cfa_offset 8; .cfi_rel_offset X, 0 +# define cfi_pop(X) .cfi_adjust_cfa_offset -8; .cfi_restore X +# else +# define cfi_push(X) .cfi_adjust_cfa_offset 4; .cfi_rel_offset X, 0 +# define cfi_pop(X) .cfi_adjust_cfa_offset -4; .cfi_restore X +# endif +#else +# define cfi_startproc() +# define cfi_endproc() +# define cfi_adjust_cfa_offset(X) +# define cfi_def_cfa_register(X) +# define cfi_register(D,S) +# define cfi_push(X) +# define cfi_pop(X) +#endif /* HAVE_GAS_CFI_SECTIONS_DIRECTIVE */ + +#ifdef L_chkstk +/* Function prologue calls __chkstk to probe the stack when allocating more + than CHECK_STACK_LIMIT bytes in one go. Touching the stack at 4K + increments is necessary to ensure that the guard pages used + by the OS virtual memory manager are allocated in correct sequence. 
*/ + + .global ___chkstk + .global __alloca +#ifdef _WIN64 +/* __alloca is a normal function call, which uses %rcx as the argument. */ + cfi_startproc() +__alloca: + movq %rcx, %rax + /* FALLTHRU */ + +/* ___chkstk is a *special* function call, which uses %rax as the argument. + We avoid clobbering the 4 integer argument registers, %rcx, %rdx, + %r8 and %r9, which leaves us with %rax, %r10, and %r11 to use. */ + .align 4 +___chkstk: + popq %r11 /* pop return address */ + cfi_adjust_cfa_offset(-8) /* indicate return address in r11 */ + cfi_register(%rip, %r11) + movq %rsp, %r10 + cmpq $0x1000, %rax /* > 4k ?*/ + jb 2f + +1: subq $0x1000, %r10 /* yes, move pointer down 4k*/ + orl $0x0, (%r10) /* probe there */ + subq $0x1000, %rax /* decrement count */ + cmpq $0x1000, %rax + ja 1b /* and do it again */ + +2: subq %rax, %r10 + movq %rsp, %rax /* hold CFA until return */ + cfi_def_cfa_register(%rax) + orl $0x0, (%r10) /* less than 4k, just peek here */ + movq %r10, %rsp /* decrement stack */ + + /* Push the return value back. Doing this instead of just + jumping to %r11 preserves the cached call-return stack + used by most modern processors. */ + pushq %r11 + ret + cfi_endproc() +#else +/* 32-bit variant: ___chkstk and __alloca share one entry point and take + the byte count in %eax. %ecx is used as the probe pointer and is + saved on entry and reloaded before returning. */ + cfi_startproc() +___chkstk: +__alloca: + pushl %ecx /* save temp */ + cfi_push(%ecx) /* CFI: the slot just pushed holds %ecx, not %eax */ + leal 8(%esp), %ecx /* point past return addr */ + cmpl $0x1000, %eax /* > 4k ?*/ + jb 2f + +1: subl $0x1000, %ecx /* yes, move pointer down 4k*/ + orl $0x0, (%ecx) /* probe there */ + subl $0x1000, %eax /* decrement count */ + cmpl $0x1000, %eax + ja 1b /* and do it again */ + +2: subl %eax, %ecx + orl $0x0, (%ecx) /* less than 4k, just peek here */ + movl %esp, %eax /* save current stack pointer */ + cfi_def_cfa_register(%eax) + movl %ecx, %esp /* decrement stack */ + movl (%eax), %ecx /* recover saved temp */ + + /* Copy the return register. Doing this instead of just jumping to + the address preserves the cached call-return stack used by most + modern processors. 
*/ + pushl 4(%eax) + ret + cfi_endproc() +#endif /* _WIN64 */ +#endif /* L_chkstk */ + +#ifdef L_chkstk_ms +/* ___chkstk_ms is a *special* function call, which uses %rax as the argument. + We avoid clobbering any registers. Unlike ___chkstk, it just probes the + stack and does no stack allocation. */ + .global ___chkstk_ms +#ifdef _WIN64 + cfi_startproc() +___chkstk_ms: + pushq %rcx /* save temps */ + cfi_push(%rcx) + pushq %rax + cfi_push(%rax) + cmpq $0x1000, %rax /* > 4k ?*/ + leaq 24(%rsp), %rcx /* point past return addr */ + jb 2f + +1: subq $0x1000, %rcx /* yes, move pointer down 4k */ + orq $0x0, (%rcx) /* probe there */ + subq $0x1000, %rax /* decrement count */ + cmpq $0x1000, %rax + ja 1b /* and do it again */ + +2: subq %rax, %rcx + orq $0x0, (%rcx) /* less than 4k, just peek here */ + + popq %rax + cfi_pop(%rax) + popq %rcx + cfi_pop(%rcx) + ret + cfi_endproc() +#else + cfi_startproc() +___chkstk_ms: + pushl %ecx /* save temp */ + cfi_push(%ecx) + pushl %eax + cfi_push(%eax) + cmpl $0x1000, %eax /* > 4k ?*/ + leal 12(%esp), %ecx /* point past return addr */ + jb 2f + +1: subl $0x1000, %ecx /* yes, move pointer down 4k*/ + orl $0x0, (%ecx) /* probe there */ + subl $0x1000, %eax /* decrement count */ + cmpl $0x1000, %eax + ja 1b /* and do it again */ + +2: subl %eax, %ecx + orl $0x0, (%ecx) /* less than 4k, just peek here */ + + popl %eax + cfi_pop(%eax) + popl %ecx + cfi_pop(%ecx) + ret + cfi_endproc() +#endif /* _WIN64 */ +#endif /* L_chkstk_ms */ diff --git a/libgcc/config/i386/t-chkstk b/libgcc/config/i386/t-chkstk new file mode 100644 index 00000000000..822981faab8 --- /dev/null +++ b/libgcc/config/i386/t-chkstk @@ -0,0 +1,2 @@ +LIB1ASMSRC = i386/cygwin.S +LIB1ASMFUNCS = _chkstk _chkstk_ms diff --git a/libgcc/config/ia64/__divxf3.asm b/libgcc/config/ia64/__divxf3.S index f741bdaf9bc..9cba8f59423 100644 --- a/libgcc/config/ia64/__divxf3.asm +++ b/libgcc/config/ia64/__divxf3.S @@ -3,7 +3,7 @@ #endif #define L__divxf3 -#include 
"config/ia64/lib1funcs.asm" +#include "config/ia64/lib1funcs.S" #ifdef SHARED #undef __divtf3 diff --git a/libgcc/config/ia64/_fixtfdi.asm b/libgcc/config/ia64/_fixtfdi.S index 4d13c808c51..863b70f7edc 100644 --- a/libgcc/config/ia64/_fixtfdi.asm +++ b/libgcc/config/ia64/_fixtfdi.S @@ -3,7 +3,7 @@ #endif #define L_fixtfdi -#include "config/ia64/lib1funcs.asm" +#include "config/ia64/lib1funcs.S" #ifdef SHARED #undef __fixtfti diff --git a/libgcc/config/ia64/_fixunstfdi.asm b/libgcc/config/ia64/_fixunstfdi.S index b722d9e90dc..aac6a284eaa 100644 --- a/libgcc/config/ia64/_fixunstfdi.asm +++ b/libgcc/config/ia64/_fixunstfdi.S @@ -3,7 +3,7 @@ #endif #define L_fixunstfdi -#include "config/ia64/lib1funcs.asm" +#include "config/ia64/lib1funcs.S" #ifdef SHARED #undef __fixunstfti diff --git a/libgcc/config/ia64/_floatditf.asm b/libgcc/config/ia64/_floatditf.S index 21d77028176..e37404d26d5 100644 --- a/libgcc/config/ia64/_floatditf.asm +++ b/libgcc/config/ia64/_floatditf.S @@ -3,7 +3,7 @@ #endif #define L_floatditf -#include "config/ia64/lib1funcs.asm" +#include "config/ia64/lib1funcs.S" #ifdef SHARED #undef __floattitf diff --git a/libgcc/config/ia64/lib1funcs.S b/libgcc/config/ia64/lib1funcs.S new file mode 100644 index 00000000000..b7eaa6eca3c --- /dev/null +++ b/libgcc/config/ia64/lib1funcs.S @@ -0,0 +1,795 @@ +/* Copyright (C) 2000, 2001, 2003, 2005, 2009 Free Software Foundation, Inc. + Contributed by James E. Wilson <wilson@cygnus.com>. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifdef L__divxf3 +// Compute a 80-bit IEEE double-extended quotient. +// +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. +// +// farg0 holds the dividend. farg1 holds the divisor. +// +// __divtf3 is an alternate symbol name for backward compatibility. + + .text + .align 16 + .global __divxf3 + .proc __divxf3 +__divxf3: +#ifdef SHARED + .global __divtf3 +__divtf3: +#endif + cmp.eq p7, p0 = r0, r0 + frcpa.s0 f10, p6 = farg0, farg1 + ;; +(p6) cmp.ne p7, p0 = r0, r0 + .pred.rel.mutex p6, p7 +(p6) fnma.s1 f11 = farg1, f10, f1 +(p6) fma.s1 f12 = farg0, f10, f0 + ;; +(p6) fma.s1 f13 = f11, f11, f0 +(p6) fma.s1 f14 = f11, f11, f11 + ;; +(p6) fma.s1 f11 = f13, f13, f11 +(p6) fma.s1 f13 = f14, f10, f10 + ;; +(p6) fma.s1 f10 = f13, f11, f10 +(p6) fnma.s1 f11 = farg1, f12, farg0 + ;; +(p6) fma.s1 f11 = f11, f10, f12 +(p6) fnma.s1 f12 = farg1, f10, f1 + ;; +(p6) fma.s1 f10 = f12, f10, f10 +(p6) fnma.s1 f12 = farg1, f11, farg0 + ;; +(p6) fma.s0 fret0 = f12, f10, f11 +(p7) mov fret0 = f10 + br.ret.sptk rp + .endp __divxf3 +#endif + +#ifdef L__divdf3 +// Compute a 64-bit IEEE double quotient. +// +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. +// +// farg0 holds the dividend. farg1 holds the divisor. 
+ + .text + .align 16 + .global __divdf3 + .proc __divdf3 +__divdf3: + cmp.eq p7, p0 = r0, r0 + frcpa.s0 f10, p6 = farg0, farg1 + ;; +(p6) cmp.ne p7, p0 = r0, r0 + .pred.rel.mutex p6, p7 +(p6) fmpy.s1 f11 = farg0, f10 +(p6) fnma.s1 f12 = farg1, f10, f1 + ;; +(p6) fma.s1 f11 = f12, f11, f11 +(p6) fmpy.s1 f13 = f12, f12 + ;; +(p6) fma.s1 f10 = f12, f10, f10 +(p6) fma.s1 f11 = f13, f11, f11 + ;; +(p6) fmpy.s1 f12 = f13, f13 +(p6) fma.s1 f10 = f13, f10, f10 + ;; +(p6) fma.d.s1 f11 = f12, f11, f11 +(p6) fma.s1 f10 = f12, f10, f10 + ;; +(p6) fnma.d.s1 f8 = farg1, f11, farg0 + ;; +(p6) fma.d fret0 = f8, f10, f11 +(p7) mov fret0 = f10 + br.ret.sptk rp + ;; + .endp __divdf3 +#endif + +#ifdef L__divsf3 +// Compute a 32-bit IEEE float quotient. +// +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. +// +// farg0 holds the dividend. farg1 holds the divisor. + + .text + .align 16 + .global __divsf3 + .proc __divsf3 +__divsf3: + cmp.eq p7, p0 = r0, r0 + frcpa.s0 f10, p6 = farg0, farg1 + ;; +(p6) cmp.ne p7, p0 = r0, r0 + .pred.rel.mutex p6, p7 +(p6) fmpy.s1 f8 = farg0, f10 +(p6) fnma.s1 f9 = farg1, f10, f1 + ;; +(p6) fma.s1 f8 = f9, f8, f8 +(p6) fmpy.s1 f9 = f9, f9 + ;; +(p6) fma.s1 f8 = f9, f8, f8 +(p6) fmpy.s1 f9 = f9, f9 + ;; +(p6) fma.d.s1 f10 = f9, f8, f8 + ;; +(p6) fnorm.s.s0 fret0 = f10 +(p7) mov fret0 = f10 + br.ret.sptk rp + ;; + .endp __divsf3 +#endif + +#ifdef L__divdi3 +// Compute a 64-bit integer quotient. +// +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. +// +// in0 holds the dividend. in1 holds the divisor. + + .text + .align 16 + .global __divdi3 + .proc __divdi3 +__divdi3: + .regstk 2,0,0,0 + // Transfer inputs to FP registers. + setf.sig f8 = in0 + setf.sig f9 = in1 + // Check divide by zero. + cmp.ne.unc p0,p7=0,in1 + ;; + // Convert the inputs to FP, so that they won't be treated as unsigned. 
+ fcvt.xf f8 = f8 + fcvt.xf f9 = f9 +(p7) break 1 + ;; + // Compute the reciprocal approximation. + frcpa.s1 f10, p6 = f8, f9 + ;; + // 3 Newton-Raphson iterations. +(p6) fnma.s1 f11 = f9, f10, f1 +(p6) fmpy.s1 f12 = f8, f10 + ;; +(p6) fmpy.s1 f13 = f11, f11 +(p6) fma.s1 f12 = f11, f12, f12 + ;; +(p6) fma.s1 f10 = f11, f10, f10 +(p6) fma.s1 f11 = f13, f12, f12 + ;; +(p6) fma.s1 f10 = f13, f10, f10 +(p6) fnma.s1 f12 = f9, f11, f8 + ;; +(p6) fma.s1 f10 = f12, f10, f11 + ;; + // Round quotient to an integer. + fcvt.fx.trunc.s1 f10 = f10 + ;; + // Transfer result to GP registers. + getf.sig ret0 = f10 + br.ret.sptk rp + ;; + .endp __divdi3 +#endif + +#ifdef L__moddi3 +// Compute a 64-bit integer modulus. +// +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. +// +// in0 holds the dividend (a). in1 holds the divisor (b). + + .text + .align 16 + .global __moddi3 + .proc __moddi3 +__moddi3: + .regstk 2,0,0,0 + // Transfer inputs to FP registers. + setf.sig f14 = in0 + setf.sig f9 = in1 + // Check divide by zero. + cmp.ne.unc p0,p7=0,in1 + ;; + // Convert the inputs to FP, so that they won't be treated as unsigned. + fcvt.xf f8 = f14 + fcvt.xf f9 = f9 +(p7) break 1 + ;; + // Compute the reciprocal approximation. + frcpa.s1 f10, p6 = f8, f9 + ;; + // 3 Newton-Raphson iterations. +(p6) fmpy.s1 f12 = f8, f10 +(p6) fnma.s1 f11 = f9, f10, f1 + ;; +(p6) fma.s1 f12 = f11, f12, f12 +(p6) fmpy.s1 f13 = f11, f11 + ;; +(p6) fma.s1 f10 = f11, f10, f10 +(p6) fma.s1 f11 = f13, f12, f12 + ;; + sub in1 = r0, in1 +(p6) fma.s1 f10 = f13, f10, f10 +(p6) fnma.s1 f12 = f9, f11, f8 + ;; + setf.sig f9 = in1 +(p6) fma.s1 f10 = f12, f10, f11 + ;; + fcvt.fx.trunc.s1 f10 = f10 + ;; + // r = q * (-b) + a + xma.l f10 = f10, f9, f14 + ;; + // Transfer result to GP registers. + getf.sig ret0 = f10 + br.ret.sptk rp + ;; + .endp __moddi3 +#endif + +#ifdef L__udivdi3 +// Compute a 64-bit unsigned integer quotient. 
+// +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. +// +// in0 holds the dividend. in1 holds the divisor. + + .text + .align 16 + .global __udivdi3 + .proc __udivdi3 +__udivdi3: + .regstk 2,0,0,0 + // Transfer inputs to FP registers. + setf.sig f8 = in0 + setf.sig f9 = in1 + // Check divide by zero. + cmp.ne.unc p0,p7=0,in1 + ;; + // Convert the inputs to FP, to avoid FP software-assist faults. + fcvt.xuf.s1 f8 = f8 + fcvt.xuf.s1 f9 = f9 +(p7) break 1 + ;; + // Compute the reciprocal approximation. + frcpa.s1 f10, p6 = f8, f9 + ;; + // 3 Newton-Raphson iterations. +(p6) fnma.s1 f11 = f9, f10, f1 +(p6) fmpy.s1 f12 = f8, f10 + ;; +(p6) fmpy.s1 f13 = f11, f11 +(p6) fma.s1 f12 = f11, f12, f12 + ;; +(p6) fma.s1 f10 = f11, f10, f10 +(p6) fma.s1 f11 = f13, f12, f12 + ;; +(p6) fma.s1 f10 = f13, f10, f10 +(p6) fnma.s1 f12 = f9, f11, f8 + ;; +(p6) fma.s1 f10 = f12, f10, f11 + ;; + // Round quotient to an unsigned integer. + fcvt.fxu.trunc.s1 f10 = f10 + ;; + // Transfer result to GP registers. + getf.sig ret0 = f10 + br.ret.sptk rp + ;; + .endp __udivdi3 +#endif + +#ifdef L__umoddi3 +// Compute a 64-bit unsigned integer modulus. +// +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. +// +// in0 holds the dividend (a). in1 holds the divisor (b). + + .text + .align 16 + .global __umoddi3 + .proc __umoddi3 +__umoddi3: + .regstk 2,0,0,0 + // Transfer inputs to FP registers. + setf.sig f14 = in0 + setf.sig f9 = in1 + // Check divide by zero. + cmp.ne.unc p0,p7=0,in1 + ;; + // Convert the inputs to FP, to avoid FP software assist faults. + fcvt.xuf.s1 f8 = f14 + fcvt.xuf.s1 f9 = f9 +(p7) break 1; + ;; + // Compute the reciprocal approximation. + frcpa.s1 f10, p6 = f8, f9 + ;; + // 3 Newton-Raphson iterations. 
+(p6) fmpy.s1 f12 = f8, f10 +(p6) fnma.s1 f11 = f9, f10, f1 + ;; +(p6) fma.s1 f12 = f11, f12, f12 +(p6) fmpy.s1 f13 = f11, f11 + ;; +(p6) fma.s1 f10 = f11, f10, f10 +(p6) fma.s1 f11 = f13, f12, f12 + ;; + sub in1 = r0, in1 +(p6) fma.s1 f10 = f13, f10, f10 +(p6) fnma.s1 f12 = f9, f11, f8 + ;; + setf.sig f9 = in1 +(p6) fma.s1 f10 = f12, f10, f11 + ;; + // Round quotient to an unsigned integer. + fcvt.fxu.trunc.s1 f10 = f10 + ;; + // r = q * (-b) + a + xma.l f10 = f10, f9, f14 + ;; + // Transfer result to GP registers. + getf.sig ret0 = f10 + br.ret.sptk rp + ;; + .endp __umoddi3 +#endif + +#ifdef L__divsi3 +// Compute a 32-bit integer quotient. +// +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. +// +// in0 holds the dividend. in1 holds the divisor. + + .text + .align 16 + .global __divsi3 + .proc __divsi3 +__divsi3: + .regstk 2,0,0,0 + // Check divide by zero. + cmp.ne.unc p0,p7=0,in1 + sxt4 in0 = in0 + sxt4 in1 = in1 + ;; + setf.sig f8 = in0 + setf.sig f9 = in1 +(p7) break 1 + ;; + mov r2 = 0x0ffdd + fcvt.xf f8 = f8 + fcvt.xf f9 = f9 + ;; + setf.exp f11 = r2 + frcpa.s1 f10, p6 = f8, f9 + ;; +(p6) fmpy.s1 f8 = f8, f10 +(p6) fnma.s1 f9 = f9, f10, f1 + ;; +(p6) fma.s1 f8 = f9, f8, f8 +(p6) fma.s1 f9 = f9, f9, f11 + ;; +(p6) fma.s1 f10 = f9, f8, f8 + ;; + fcvt.fx.trunc.s1 f10 = f10 + ;; + getf.sig ret0 = f10 + br.ret.sptk rp + ;; + .endp __divsi3 +#endif + +#ifdef L__modsi3 +// Compute a 32-bit integer modulus. +// +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. +// +// in0 holds the dividend. in1 holds the divisor. + + .text + .align 16 + .global __modsi3 + .proc __modsi3 +__modsi3: + .regstk 2,0,0,0 + mov r2 = 0x0ffdd + sxt4 in0 = in0 + sxt4 in1 = in1 + ;; + setf.sig f13 = r32 + setf.sig f9 = r33 + // Check divide by zero. 
+ cmp.ne.unc p0,p7=0,in1 + ;; + sub in1 = r0, in1 + fcvt.xf f8 = f13 + fcvt.xf f9 = f9 + ;; + setf.exp f11 = r2 + frcpa.s1 f10, p6 = f8, f9 +(p7) break 1 + ;; +(p6) fmpy.s1 f12 = f8, f10 +(p6) fnma.s1 f10 = f9, f10, f1 + ;; + setf.sig f9 = in1 +(p6) fma.s1 f12 = f10, f12, f12 +(p6) fma.s1 f10 = f10, f10, f11 + ;; +(p6) fma.s1 f10 = f10, f12, f12 + ;; + fcvt.fx.trunc.s1 f10 = f10 + ;; + xma.l f10 = f10, f9, f13 + ;; + getf.sig ret0 = f10 + br.ret.sptk rp + ;; + .endp __modsi3 +#endif + +#ifdef L__udivsi3 +// Compute a 32-bit unsigned integer quotient. +// +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. +// +// in0 holds the dividend. in1 holds the divisor. + + .text + .align 16 + .global __udivsi3 + .proc __udivsi3 +__udivsi3: + .regstk 2,0,0,0 + mov r2 = 0x0ffdd + zxt4 in0 = in0 + zxt4 in1 = in1 + ;; + setf.sig f8 = in0 + setf.sig f9 = in1 + // Check divide by zero. + cmp.ne.unc p0,p7=0,in1 + ;; + fcvt.xf f8 = f8 + fcvt.xf f9 = f9 +(p7) break 1 + ;; + setf.exp f11 = r2 + frcpa.s1 f10, p6 = f8, f9 + ;; +(p6) fmpy.s1 f8 = f8, f10 +(p6) fnma.s1 f9 = f9, f10, f1 + ;; +(p6) fma.s1 f8 = f9, f8, f8 +(p6) fma.s1 f9 = f9, f9, f11 + ;; +(p6) fma.s1 f10 = f9, f8, f8 + ;; + fcvt.fxu.trunc.s1 f10 = f10 + ;; + getf.sig ret0 = f10 + br.ret.sptk rp + ;; + .endp __udivsi3 +#endif + +#ifdef L__umodsi3 +// Compute a 32-bit unsigned integer modulus. +// +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. +// +// in0 holds the dividend. in1 holds the divisor. + + .text + .align 16 + .global __umodsi3 + .proc __umodsi3 +__umodsi3: + .regstk 2,0,0,0 + mov r2 = 0x0ffdd + zxt4 in0 = in0 + zxt4 in1 = in1 + ;; + setf.sig f13 = in0 + setf.sig f9 = in1 + // Check divide by zero. 
+ cmp.ne.unc p0,p7=0,in1 + ;; + sub in1 = r0, in1 + fcvt.xf f8 = f13 + fcvt.xf f9 = f9 + ;; + setf.exp f11 = r2 + frcpa.s1 f10, p6 = f8, f9 +(p7) break 1; + ;; +(p6) fmpy.s1 f12 = f8, f10 +(p6) fnma.s1 f10 = f9, f10, f1 + ;; + setf.sig f9 = in1 +(p6) fma.s1 f12 = f10, f12, f12 +(p6) fma.s1 f10 = f10, f10, f11 + ;; +(p6) fma.s1 f10 = f10, f12, f12 + ;; + fcvt.fxu.trunc.s1 f10 = f10 + ;; + xma.l f10 = f10, f9, f13 + ;; + getf.sig ret0 = f10 + br.ret.sptk rp + ;; + .endp __umodsi3 +#endif + +#ifdef L__save_stack_nonlocal +// Notes on save/restore stack nonlocal: We read ar.bsp but write +// ar.bspstore. This is because ar.bsp can be read at all times +// (independent of the RSE mode) but since it's read-only we need to +// restore the value via ar.bspstore. This is OK because +// ar.bsp==ar.bspstore after executing "flushrs". + +// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer) + + .text + .align 16 + .global __ia64_save_stack_nonlocal + .proc __ia64_save_stack_nonlocal +__ia64_save_stack_nonlocal: + { .mmf + alloc r18 = ar.pfs, 2, 0, 0, 0 + mov r19 = ar.rsc + ;; + } + { .mmi + flushrs + st8 [in0] = in1, 24 + and r19 = 0x1c, r19 + ;; + } + { .mmi + st8 [in0] = r18, -16 + mov ar.rsc = r19 + or r19 = 0x3, r19 + ;; + } + { .mmi + mov r16 = ar.bsp + mov r17 = ar.rnat + adds r2 = 8, in0 + ;; + } + { .mmi + st8 [in0] = r16 + st8 [r2] = r17 + } + { .mib + mov ar.rsc = r19 + br.ret.sptk.few rp + ;; + } + .endp __ia64_save_stack_nonlocal +#endif + +#ifdef L__nonlocal_goto +// void __ia64_nonlocal_goto(void *target_label, void *save_area, +// void *static_chain); + + .text + .align 16 + .global __ia64_nonlocal_goto + .proc __ia64_nonlocal_goto +__ia64_nonlocal_goto: + { .mmi + alloc r20 = ar.pfs, 3, 0, 0, 0 + ld8 r12 = [in1], 8 + mov.ret.sptk rp = in0, .L0 + ;; + } + { .mmf + ld8 r16 = [in1], 8 + mov r19 = ar.rsc + ;; + } + { .mmi + flushrs + ld8 r17 = [in1], 8 + and r19 = 0x1c, r19 + ;; + } + { .mmi + ld8 r18 = [in1] + mov ar.rsc = r19 + or r19 = 0x3, 
r19 + ;; + } + { .mmi + mov ar.bspstore = r16 + ;; + mov ar.rnat = r17 + ;; + } + { .mmi + loadrs + invala + mov r15 = in2 + ;; + } +.L0: { .mib + mov ar.rsc = r19 + mov ar.pfs = r18 + br.ret.sptk.few rp + ;; + } + .endp __ia64_nonlocal_goto +#endif + +#ifdef L__restore_stack_nonlocal +// This is mostly the same as nonlocal_goto above. +// ??? This has not been tested yet. + +// void __ia64_restore_stack_nonlocal(void *save_area) + + .text + .align 16 + .global __ia64_restore_stack_nonlocal + .proc __ia64_restore_stack_nonlocal +__ia64_restore_stack_nonlocal: + { .mmf + alloc r20 = ar.pfs, 4, 0, 0, 0 + ld8 r12 = [in0], 8 + ;; + } + { .mmb + ld8 r16=[in0], 8 + mov r19 = ar.rsc + ;; + } + { .mmi + flushrs + ld8 r17 = [in0], 8 + and r19 = 0x1c, r19 + ;; + } + { .mmf + ld8 r18 = [in0] + mov ar.rsc = r19 + ;; + } + { .mmi + mov ar.bspstore = r16 + ;; + mov ar.rnat = r17 + or r19 = 0x3, r19 + ;; + } + { .mmf + loadrs + invala + ;; + } +.L0: { .mib + mov ar.rsc = r19 + mov ar.pfs = r18 + br.ret.sptk.few rp + ;; + } + .endp __ia64_restore_stack_nonlocal +#endif + +#ifdef L__trampoline +// Implement the nested function trampoline. This is out of line +// so that we don't have to bother with flushing the icache, as +// well as making the on-stack trampoline smaller. +// +// The trampoline has the following form: +// +// +-------------------+ > +// TRAMP: | __ia64_trampoline | | +// +-------------------+ > fake function descriptor +// | TRAMP+16 | | +// +-------------------+ > +// | target descriptor | +// +-------------------+ +// | static link | +// +-------------------+ + + .text + .align 16 + .global __ia64_trampoline + .proc __ia64_trampoline +__ia64_trampoline: + { .mmi + ld8 r2 = [r1], 8 + ;; + ld8 r15 = [r1] + } + { .mmi + ld8 r3 = [r2], 8 + ;; + ld8 r1 = [r2] + mov b6 = r3 + } + { .bbb + br.sptk.many b6 + ;; + } + .endp __ia64_trampoline +#endif + +#ifdef SHARED +// Thunks for backward compatibility. 
+#ifdef L_fixtfdi + .text + .align 16 + .global __fixtfti + .proc __fixtfti +__fixtfti: + { .bbb + br.sptk.many __fixxfti + ;; + } + .endp __fixtfti +#endif +#ifdef L_fixunstfdi + .align 16 + .global __fixunstfti + .proc __fixunstfti +__fixunstfti: + { .bbb + br.sptk.many __fixunsxfti + ;; + } + .endp __fixunstfti +#endif +#ifdef L_floatditf + .align 16 + .global __floattitf + .proc __floattitf +__floattitf: + { .bbb + br.sptk.many __floattixf + ;; + } + .endp __floattitf +#endif +#endif diff --git a/libgcc/config/ia64/t-hpux b/libgcc/config/ia64/t-hpux index ef3387e7a61..1fee41385c0 100644 --- a/libgcc/config/ia64/t-hpux +++ b/libgcc/config/ia64/t-hpux @@ -1 +1,6 @@ +# On HP-UX we do not want _fixtfdi, _fixunstfdi, or _floatditf from +# LIB1ASMSRC. These functions map the 128 bit conversion function names +# to 80 bit conversions and were done for Linux backwards compatibility. +LIB1ASMFUNCS := $(filter-out _fixtfdi _fixunstfdi _floatditf,$(LIB1ASMFUNCS)) + LIB2ADDEH = $(srcdir)/unwind-c.c diff --git a/libgcc/config/ia64/t-ia64 b/libgcc/config/ia64/t-ia64 index 59cf3aa75f4..80445d8a2a8 100644 --- a/libgcc/config/ia64/t-ia64 +++ b/libgcc/config/ia64/t-ia64 @@ -1,3 +1,16 @@ +LIB1ASMSRC = ia64/lib1funcs.S + +# We use different names for the DImode div/mod files so that they won't +# conflict with libgcc2.c files. We used to use __ia64 as a prefix, now +# we use __ as the prefix. Note that L_divdi3 in libgcc2.c actually defines +# a TImode divide function, so there is no actual overlap here between +# libgcc2.c and lib1funcs.S. +LIB1ASMFUNCS = __divxf3 __divdf3 __divsf3 \ + __divdi3 __moddi3 __udivdi3 __umoddi3 \ + __divsi3 __modsi3 __udivsi3 __umodsi3 __save_stack_nonlocal \ + __nonlocal_goto __restore_stack_nonlocal __trampoline \ + _fixtfdi _fixunstfdi _floatditf + CUSTOM_CRTSTUFF = yes # Assemble startup files. 
diff --git a/libgcc/config/ia64/t-softfp-compat b/libgcc/config/ia64/t-softfp-compat index d3dad68c48f..00f45d51cd0 100644 --- a/libgcc/config/ia64/t-softfp-compat +++ b/libgcc/config/ia64/t-softfp-compat @@ -3,5 +3,5 @@ # Replace __dvxf3 _fixtfdi _fixunstfdi _floatditf libgcc1-tf-functions = __divxf3 _fixtfdi _fixunstfdi _floatditf LIB1ASMFUNCS := $(filter-out $(libgcc1-tf-functions), $(LIB1ASMFUNCS)) -libgcc1-tf-compats = $(addsuffix .asm, $(libgcc1-tf-functions)) +libgcc1-tf-compats = $(addsuffix .S, $(libgcc1-tf-functions)) LIB2ADD += $(addprefix $(srcdir)/config/ia64/, $(libgcc1-tf-compats)) diff --git a/libgcc/config/m32c/lib1funcs.S b/libgcc/config/m32c/lib1funcs.S new file mode 100644 index 00000000000..9b657787187 --- /dev/null +++ b/libgcc/config/m32c/lib1funcs.S @@ -0,0 +1,231 @@ +/* libgcc routines for R8C/M16C/M32C + Copyright (C) 2005, 2009, 2010 + Free Software Foundation, Inc. + Contributed by Red Hat. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if defined(__r8c_cpu__) || defined(__m16c_cpu__) +#define A16 +#define A(n,w) n +#define W w +#else +#define A24 +#define A(n,w) w +#define W l +#endif + + +#ifdef L__m32c_memregs + +/* Warning: these memory locations are used as a register bank. They + *must* end up consecutive in any final executable, so you may *not* + use the otherwise obvious ".comm" directive to allocate space for + them. */ + + .bss + .global mem0 +mem0: .space 1 + .global mem1 +mem1: .space 1 + .global mem2 +mem2: .space 1 + .global mem3 +mem3: .space 1 + .global mem4 +mem4: .space 1 + .global mem5 +mem5: .space 1 + .global mem6 +mem6: .space 1 + .global mem7 +mem7: .space 1 + .global mem8 +mem8: .space 1 + .global mem9 +mem9: .space 1 + .global mem10 +mem10: .space 1 + .global mem11 +mem11: .space 1 + .global mem12 +mem12: .space 1 + .global mem13 +mem13: .space 1 + .global mem14 +mem14: .space 1 + .global mem15 +mem15: .space 1 + +#endif + +#ifdef L__m32c_eh_return + .text + .global __m32c_eh_return +__m32c_eh_return: + + /* At this point, r0 has the stack adjustment, r1r3 has the + address to return to. The stack looks like this: + + old_ra + old_fp + <- unwound sp + ... + fb + through + r0 + <- sp + + What we need to do is restore all the registers, update the + stack, and return to the right place. + */ + + stc sp,a0 + + add.W A(#16,#24),a0 + /* a0 points to the current stack, just above the register + save areas */ + + mov.w a0,a1 + exts.w r0 + sub.W A(r0,r2r0),a1 + sub.W A(#3,#4),a1 + /* a1 points to the new stack. */ + + /* This is for the "rts" below. */ + mov.w r1,[a1] +#ifdef A16 + mov.w r2,r1 + mov.b r1l,2[a1] +#else + mov.w r2,2[a1] +#endif + + /* This is for the "popc sp" below. */ + mov.W a1,[a0] + + popm r0,r1,r2,r3,a0,a1,sb,fb + popc sp + rts +#endif + +/* SImode arguments for SI foo(SI,SI) functions. 
*/ +#ifdef A16 +#define SAL 5[fb] +#define SAH 7[fb] +#define SBL 9[fb] +#define SBH 11[fb] +#else +#define SAL 8[fb] +#define SAH 10[fb] +#define SBL 12[fb] +#define SBH 14[fb] +#endif + +#ifdef L__m32c_mulsi3 + .text + .global ___mulsi3 +___mulsi3: + enter #0 + push.w r2 + mov.w SAL,r0 + mulu.w SBL,r0 /* writes to r2r0 */ + mov.w r0,mem0 + mov.w r2,mem2 + mov.w SAL,r0 + mulu.w SBH,r0 /* writes to r2r0 */ + add.w r0,mem2 + mov.w SAH,r0 + mulu.w SBL,r0 /* writes to r2r0 */ + add.w r0,mem2 + pop.w r2 + exitd +#endif + +#ifdef L__m32c_cmpsi2 + .text + .global ___cmpsi2 +___cmpsi2: + enter #0 + cmp.w SBH,SAH + jgt cmpsi_gt + jlt cmpsi_lt + cmp.w SBL,SAL + jgt cmpsi_gt + jlt cmpsi_lt + mov.w #1,r0 + exitd +cmpsi_gt: + mov.w #2,r0 + exitd +cmpsi_lt: + mov.w #0,r0 + exitd +#endif + +#ifdef L__m32c_ucmpsi2 + .text + .global ___ucmpsi2 +___ucmpsi2: + enter #0 + cmp.w SBH,SAH + jgtu cmpsi_gt + jltu cmpsi_lt + cmp.w SBL,SAL + jgtu cmpsi_gt + jltu cmpsi_lt + mov.w #1,r0 + exitd +cmpsi_gt: + mov.w #2,r0 + exitd +cmpsi_lt: + mov.w #0,r0 + exitd +#endif + +#ifdef L__m32c_jsri16 + .text +#ifdef A16 + .global m32c_jsri16 +m32c_jsri16: + add.w #-1, sp + + /* Read the address (16 bits) and return address (24 bits) off + the stack. */ + mov.w 4[sp], r0 + mov.w 1[sp], r3 + mov.b 3[sp], a0 /* This zero-extends, so the high byte has + zero in it. */ + + /* Write the return address, then new address, to the stack. */ + mov.w a0, 1[sp] /* Just to get the zero in 2[sp]. */ + mov.w r0, 0[sp] + mov.w r3, 3[sp] + mov.b a0, 5[sp] + + /* This "returns" to the target address, leaving the pending + return address on the stack. 
*/ + rts +#endif + +#endif diff --git a/libgcc/config/m32c/t-m32c b/libgcc/config/m32c/t-m32c new file mode 100644 index 00000000000..d21483750fd --- /dev/null +++ b/libgcc/config/m32c/t-m32c @@ -0,0 +1,9 @@ +LIB1ASMSRC = m32c/lib1funcs.S + +LIB1ASMFUNCS = \ + __m32c_memregs \ + __m32c_eh_return \ + __m32c_mulsi3 \ + __m32c_cmpsi2 \ + __m32c_ucmpsi2 \ + __m32c_jsri16 diff --git a/libgcc/config/m32r/initfini.c b/libgcc/config/m32r/initfini.c index 6e7d58614c7..56332459223 100644 --- a/libgcc/config/m32r/initfini.c +++ b/libgcc/config/m32r/initfini.c @@ -1,5 +1,5 @@ /* .init/.fini section handling + C++ global constructor/destructor handling. - This file is based on crtstuff.c, sol2-crti.asm, sol2-crtn.asm. + This file is based on crtstuff.c, sol2-crti.S, sol2-crtn.S. Copyright (C) 1996, 1997, 1998, 2006, 2009 Free Software Foundation, Inc. diff --git a/libgcc/config/m68k/lb1sf68.S b/libgcc/config/m68k/lb1sf68.S new file mode 100644 index 00000000000..0339a092c4f --- /dev/null +++ b/libgcc/config/m68k/lb1sf68.S @@ -0,0 +1,4116 @@ +/* libgcc routines for 68000 w/o floating-point hardware. + Copyright (C) 1994, 1996, 1997, 1998, 2008, 2009 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3, or (at your option) any +later version. + +This file is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. 
+ +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + +/* Use this one for any 680x0; assumes no floating point hardware. + The trailing " '" appearing on some lines is for ANSI preprocessors. Yuk. + Some of this code comes from MINIX, via the folks at ericsson. + D. V. Henkel-Wallace (gumby@cygnus.com) Fete Bastille, 1992 +*/ + +/* These are predefined by new versions of GNU cpp. */ + +#ifndef __USER_LABEL_PREFIX__ +#define __USER_LABEL_PREFIX__ _ +#endif + +#ifndef __REGISTER_PREFIX__ +#define __REGISTER_PREFIX__ +#endif + +#ifndef __IMMEDIATE_PREFIX__ +#define __IMMEDIATE_PREFIX__ # +#endif + +/* ANSI concatenation macros. */ + +#define CONCAT1(a, b) CONCAT2(a, b) +#define CONCAT2(a, b) a ## b + +/* Use the right prefix for global labels. */ + +#define SYM(x) CONCAT1 (__USER_LABEL_PREFIX__, x) + +/* Note that X is a function. */ + +#ifdef __ELF__ +#define FUNC(x) .type SYM(x),function +#else +/* The .proc pseudo-op is accepted, but ignored, by GAS. We could just + define this to the empty string for non-ELF systems, but defining it + to .proc means that the information is available to the assembler if + the need arises. */ +#define FUNC(x) .proc +#endif + +/* Use the right prefix for registers. */ + +#define REG(x) CONCAT1 (__REGISTER_PREFIX__, x) + +/* Use the right prefix for immediate values. 
*/ + +#define IMM(x) CONCAT1 (__IMMEDIATE_PREFIX__, x) + +#define d0 REG (d0) +#define d1 REG (d1) +#define d2 REG (d2) +#define d3 REG (d3) +#define d4 REG (d4) +#define d5 REG (d5) +#define d6 REG (d6) +#define d7 REG (d7) +#define a0 REG (a0) +#define a1 REG (a1) +#define a2 REG (a2) +#define a3 REG (a3) +#define a4 REG (a4) +#define a5 REG (a5) +#define a6 REG (a6) +#define fp REG (fp) +#define sp REG (sp) +#define pc REG (pc) + +/* Provide a few macros to allow for PIC code support. + * With PIC, data is stored A5 relative so we've got to take a bit of special + * care to ensure that all loads of global data is via A5. PIC also requires + * jumps and subroutine calls to be PC relative rather than absolute. We cheat + * a little on this and in the PIC case, we use short offset branches and + * hope that the final object code is within range (which it should be). + */ +#ifndef __PIC__ + + /* Non PIC (absolute/relocatable) versions */ + + .macro PICCALL addr + jbsr \addr + .endm + + .macro PICJUMP addr + jmp \addr + .endm + + .macro PICLEA sym, reg + lea \sym, \reg + .endm + + .macro PICPEA sym, areg + pea \sym + .endm + +#else /* __PIC__ */ + +# if defined (__uClinux__) + + /* Versions for uClinux */ + +# if defined(__ID_SHARED_LIBRARY__) + + /* -mid-shared-library versions */ + + .macro PICLEA sym, reg + movel a5@(_current_shared_library_a5_offset_), \reg + movel \sym@GOT(\reg), \reg + .endm + + .macro PICPEA sym, areg + movel a5@(_current_shared_library_a5_offset_), \areg + movel \sym@GOT(\areg), sp@- + .endm + + .macro PICCALL addr + PICLEA \addr,a0 + jsr a0@ + .endm + + .macro PICJUMP addr + PICLEA \addr,a0 + jmp a0@ + .endm + +# else /* !__ID_SHARED_LIBRARY__ */ + + /* Versions for -msep-data */ + + .macro PICLEA sym, reg + movel \sym@GOT(a5), \reg + .endm + + .macro PICPEA sym, areg + movel \sym@GOT(a5), sp@- + .endm + + .macro PICCALL addr +#if defined (__mcoldfire__) && !defined (__mcfisab__) && !defined (__mcfisac__) + lea \addr-.-8,a0 + jsr pc@(a0) 
+#else + jbsr \addr +#endif + .endm + + .macro PICJUMP addr + /* ISA C has no bra.l instruction, and since this assembly file + gets assembled into multiple object files, we avoid the + bra instruction entirely. */ +#if defined (__mcoldfire__) && !defined (__mcfisab__) + lea \addr-.-8,a0 + jmp pc@(a0) +#else + bra \addr +#endif + .endm + +# endif + +# else /* !__uClinux__ */ + + /* Versions for Linux */ + + .macro PICLEA sym, reg + movel #_GLOBAL_OFFSET_TABLE_@GOTPC, \reg + lea (-6, pc, \reg), \reg + movel \sym@GOT(\reg), \reg + .endm + + .macro PICPEA sym, areg + movel #_GLOBAL_OFFSET_TABLE_@GOTPC, \areg + lea (-6, pc, \areg), \areg + movel \sym@GOT(\areg), sp@- + .endm + + .macro PICCALL addr +#if defined (__mcoldfire__) && !defined (__mcfisab__) && !defined (__mcfisac__) + lea \addr-.-8,a0 + jsr pc@(a0) +#else + jbsr \addr +#endif + .endm + + .macro PICJUMP addr + /* ISA C has no bra.l instruction, and since this assembly file + gets assembled into multiple object files, we avoid the + bra instruction entirely. */ +#if defined (__mcoldfire__) && !defined (__mcfisab__) + lea \addr-.-8,a0 + jmp pc@(a0) +#else + bra \addr +#endif + .endm + +# endif +#endif /* __PIC__ */ + + +#ifdef L_floatex + +| This is an attempt at a decent floating point (single, double and +| extended double) code for the GNU C compiler. It should be easy to +| adapt to other compilers (but beware of the local labels!). + +| Starting date: 21 October, 1990 + +| It is convenient to introduce the notation (s,e,f) for a floating point +| number, where s=sign, e=exponent, f=fraction. We will call a floating +| point number fpn to abbreviate, independently of the precision. +| Let MAX_EXP be in each case the maximum exponent (255 for floats, 2047 +| for doubles and 32767 for long doubles). We then have the following +| different cases: +| 1. Normalized fpns have 0 < e < MAX_EXP. They correspond to +| (-1)^s x 1.f x 2^(e-bias-1). +| 2. Denormalized fpns have e=0. 
They correspond to numbers of the form +| (-1)^s x 0.f x 2^(-bias). +| 3. +/-INFINITY have e=MAX_EXP, f=0. +| 4. Quiet NaN (Not a Number) have all bits set. +| 5. Signaling NaN (Not a Number) have s=0, e=MAX_EXP, f=1. + +|============================================================================= +| exceptions +|============================================================================= + +| This is the floating point condition code register (_fpCCR): +| +| struct { +| short _exception_bits; +| short _trap_enable_bits; +| short _sticky_bits; +| short _rounding_mode; +| short _format; +| short _last_operation; +| union { +| float sf; +| double df; +| } _operand1; +| union { +| float sf; +| double df; +| } _operand2; +| } _fpCCR; + + .data + .even + + .globl SYM (_fpCCR) + +SYM (_fpCCR): +__exception_bits: + .word 0 +__trap_enable_bits: + .word 0 +__sticky_bits: + .word 0 +__rounding_mode: + .word ROUND_TO_NEAREST +__format: + .word NIL +__last_operation: + .word NOOP +__operand1: + .long 0 + .long 0 +__operand2: + .long 0 + .long 0 + +| Offsets: +EBITS = __exception_bits - SYM (_fpCCR) +TRAPE = __trap_enable_bits - SYM (_fpCCR) +STICK = __sticky_bits - SYM (_fpCCR) +ROUND = __rounding_mode - SYM (_fpCCR) +FORMT = __format - SYM (_fpCCR) +LASTO = __last_operation - SYM (_fpCCR) +OPER1 = __operand1 - SYM (_fpCCR) +OPER2 = __operand2 - SYM (_fpCCR) + +| The following exception types are supported: +INEXACT_RESULT = 0x0001 +UNDERFLOW = 0x0002 +OVERFLOW = 0x0004 +DIVIDE_BY_ZERO = 0x0008 +INVALID_OPERATION = 0x0010 + +| The allowed rounding modes are: +UNKNOWN = -1 +ROUND_TO_NEAREST = 0 | round result to nearest representable value +ROUND_TO_ZERO = 1 | round result towards zero +ROUND_TO_PLUS = 2 | round result towards plus infinity +ROUND_TO_MINUS = 3 | round result towards minus infinity + +| The allowed values of format are: +NIL = 0 +SINGLE_FLOAT = 1 +DOUBLE_FLOAT = 2 +LONG_FLOAT = 3 + +| The allowed values for the last operation are: +NOOP = 0 +ADD = 1 +MULTIPLY 
= 2 +DIVIDE = 3 +NEGATE = 4 +COMPARE = 5 +EXTENDSFDF = 6 +TRUNCDFSF = 7 + +|============================================================================= +| __clear_sticky_bit +|============================================================================= + +| The sticky bits are normally not cleared (thus the name), whereas the +| exception type and exception value reflect the last computation. +| This routine is provided to clear them (you can also write to _fpCCR, +| since it is globally visible). + + .globl SYM (__clear_sticky_bit) + + .text + .even + +| void __clear_sticky_bit(void); +SYM (__clear_sticky_bit): + PICLEA SYM (_fpCCR),a0 +#ifndef __mcoldfire__ + movew IMM (0),a0@(STICK) +#else + clr.w a0@(STICK) +#endif + rts + +|============================================================================= +| $_exception_handler +|============================================================================= + + .globl $_exception_handler + + .text + .even + +| This is the common exit point if an exception occurs. +| NOTE: it is NOT callable from C! +| It expects the exception type in d7, the format (SINGLE_FLOAT, +| DOUBLE_FLOAT or LONG_FLOAT) in d6, and the last operation code in d5. +| It sets the corresponding exception and sticky bits, and the format. +| Depending on the format it fills the corresponding slots for the +| operands which produced the exception (all this information is provided +| so if you write your own exception handlers you have enough information +| to deal with the problem). +| Then checks to see if the corresponding exception is trap-enabled, +| in which case it pushes the address of _fpCCR and traps through +| trap FPTRAP (15 for the moment). 
+ +FPTRAP = 15 + +$_exception_handler: + PICLEA SYM (_fpCCR),a0 + movew d7,a0@(EBITS) | set __exception_bits +#ifndef __mcoldfire__ + orw d7,a0@(STICK) | and __sticky_bits +#else + movew a0@(STICK),d4 + orl d7,d4 + movew d4,a0@(STICK) +#endif + movew d6,a0@(FORMT) | and __format + movew d5,a0@(LASTO) | and __last_operation + +| Now put the operands in place: +#ifndef __mcoldfire__ + cmpw IMM (SINGLE_FLOAT),d6 +#else + cmpl IMM (SINGLE_FLOAT),d6 +#endif + beq 1f + movel a6@(8),a0@(OPER1) + movel a6@(12),a0@(OPER1+4) + movel a6@(16),a0@(OPER2) + movel a6@(20),a0@(OPER2+4) + bra 2f +1: movel a6@(8),a0@(OPER1) + movel a6@(12),a0@(OPER2) +2: +| And check whether the exception is trap-enabled: +#ifndef __mcoldfire__ + andw a0@(TRAPE),d7 | is exception trap-enabled? +#else + clrl d6 + movew a0@(TRAPE),d6 + andl d6,d7 +#endif + beq 1f | no, exit + PICPEA SYM (_fpCCR),a1 | yes, push address of _fpCCR + trap IMM (FPTRAP) | and trap +#ifndef __mcoldfire__ +1: moveml sp@+,d2-d7 | restore data registers +#else +1: moveml sp@,d2-d7 + | XXX if frame pointer is ever removed, stack pointer must + | be adjusted here. +#endif + unlk a6 | and return + rts +#endif /* L_floatex */ + +#ifdef L_mulsi3 + .text + FUNC(__mulsi3) + .globl SYM (__mulsi3) +SYM (__mulsi3): + movew sp@(4), d0 /* x0 -> d0 */ + muluw sp@(10), d0 /* x0*y1 */ + movew sp@(6), d1 /* x1 -> d1 */ + muluw sp@(8), d1 /* x1*y0 */ +#ifndef __mcoldfire__ + addw d1, d0 +#else + addl d1, d0 +#endif + swap d0 + clrw d0 + movew sp@(6), d1 /* x1 -> d1 */ + muluw sp@(10), d1 /* x1*y1 */ + addl d1, d0 + + rts +#endif /* L_mulsi3 */ + +#ifdef L_udivsi3 + .text + FUNC(__udivsi3) + .globl SYM (__udivsi3) +SYM (__udivsi3): +#ifndef __mcoldfire__ + movel d2, sp@- + movel sp@(12), d1 /* d1 = divisor */ + movel sp@(8), d0 /* d0 = dividend */ + + cmpl IMM (0x10000), d1 /* divisor >= 2 ^ 16 ? 
*/ + jcc L3 /* then try next algorithm */ + movel d0, d2 + clrw d2 + swap d2 + divu d1, d2 /* high quotient in lower word */ + movew d2, d0 /* save high quotient */ + swap d0 + movew sp@(10), d2 /* get low dividend + high rest */ + divu d1, d2 /* low quotient */ + movew d2, d0 + jra L6 + +L3: movel d1, d2 /* use d2 as divisor backup */ +L4: lsrl IMM (1), d1 /* shift divisor */ + lsrl IMM (1), d0 /* shift dividend */ + cmpl IMM (0x10000), d1 /* still divisor >= 2 ^ 16 ? */ + jcc L4 + divu d1, d0 /* now we have 16-bit divisor */ + andl IMM (0xffff), d0 /* mask out divisor, ignore remainder */ + +/* Multiply the 16-bit tentative quotient with the 32-bit divisor. Because of + the operand ranges, this might give a 33-bit product. If this product is + greater than the dividend, the tentative quotient was too large. */ + movel d2, d1 + mulu d0, d1 /* low part, 32 bits */ + swap d2 + mulu d0, d2 /* high part, at most 17 bits */ + swap d2 /* align high part with low part */ + tstw d2 /* high part 17 bits? */ + jne L5 /* if 17 bits, quotient was too large */ + addl d2, d1 /* add parts */ + jcs L5 /* if sum is 33 bits, quotient was too large */ + cmpl sp@(8), d1 /* compare the sum with the dividend */ + jls L6 /* if sum > dividend, quotient was too large */ +L5: subql IMM (1), d0 /* adjust quotient */ + +L6: movel sp@+, d2 + rts + +#else /* __mcoldfire__ */ + +/* ColdFire implementation of non-restoring division algorithm from + Hennessy & Patterson, Appendix A. */ + link a6,IMM (-12) + moveml d2-d4,sp@ + movel a6@(8),d0 + movel a6@(12),d1 + clrl d2 | clear p + moveq IMM (31),d4 +L1: addl d0,d0 | shift reg pair (p,a) one bit left + addxl d2,d2 + movl d2,d3 | subtract b from p, store in tmp. + subl d1,d3 + jcs L2 | if no carry, + bset IMM (0),d0 | set the low order bit of a to 1, + movl d3,d2 | and store tmp in p. 
+L2: subql IMM (1),d4 + jcc L1 + moveml sp@,d2-d4 | restore data registers + unlk a6 | and return + rts +#endif /* __mcoldfire__ */ + +#endif /* L_udivsi3 */ + +#ifdef L_divsi3 + .text + FUNC(__divsi3) + .globl SYM (__divsi3) +SYM (__divsi3): + movel d2, sp@- + + moveq IMM (1), d2 /* sign of result stored in d2 (=1 or =-1) */ + movel sp@(12), d1 /* d1 = divisor */ + jpl L1 + negl d1 +#ifndef __mcoldfire__ + negb d2 /* change sign because divisor <0 */ +#else + negl d2 /* change sign because divisor <0 */ +#endif +L1: movel sp@(8), d0 /* d0 = dividend */ + jpl L2 + negl d0 +#ifndef __mcoldfire__ + negb d2 +#else + negl d2 +#endif + +L2: movel d1, sp@- + movel d0, sp@- + PICCALL SYM (__udivsi3) /* divide abs(dividend) by abs(divisor) */ + addql IMM (8), sp + + tstb d2 + jpl L3 + negl d0 + +L3: movel sp@+, d2 + rts +#endif /* L_divsi3 */ + +#ifdef L_umodsi3 + .text + FUNC(__umodsi3) + .globl SYM (__umodsi3) +SYM (__umodsi3): + movel sp@(8), d1 /* d1 = divisor */ + movel sp@(4), d0 /* d0 = dividend */ + movel d1, sp@- + movel d0, sp@- + PICCALL SYM (__udivsi3) + addql IMM (8), sp + movel sp@(8), d1 /* d1 = divisor */ +#ifndef __mcoldfire__ + movel d1, sp@- + movel d0, sp@- + PICCALL SYM (__mulsi3) /* d0 = (a/b)*b */ + addql IMM (8), sp +#else + mulsl d1,d0 +#endif + movel sp@(4), d1 /* d1 = dividend */ + subl d0, d1 /* d1 = a - (a/b)*b */ + movel d1, d0 + rts +#endif /* L_umodsi3 */ + +#ifdef L_modsi3 + .text + FUNC(__modsi3) + .globl SYM (__modsi3) +SYM (__modsi3): + movel sp@(8), d1 /* d1 = divisor */ + movel sp@(4), d0 /* d0 = dividend */ + movel d1, sp@- + movel d0, sp@- + PICCALL SYM (__divsi3) + addql IMM (8), sp + movel sp@(8), d1 /* d1 = divisor */ +#ifndef __mcoldfire__ + movel d1, sp@- + movel d0, sp@- + PICCALL SYM (__mulsi3) /* d0 = (a/b)*b */ + addql IMM (8), sp +#else + mulsl d1,d0 +#endif + movel sp@(4), d1 /* d1 = dividend */ + subl d0, d1 /* d1 = a - (a/b)*b */ + movel d1, d0 + rts +#endif /* L_modsi3 */ + + +#ifdef L_double + + .globl SYM (_fpCCR) + 
.globl $_exception_handler + +QUIET_NaN = 0xffffffff + +D_MAX_EXP = 0x07ff +D_BIAS = 1022 +DBL_MAX_EXP = D_MAX_EXP - D_BIAS +DBL_MIN_EXP = 1 - D_BIAS +DBL_MANT_DIG = 53 + +INEXACT_RESULT = 0x0001 +UNDERFLOW = 0x0002 +OVERFLOW = 0x0004 +DIVIDE_BY_ZERO = 0x0008 +INVALID_OPERATION = 0x0010 + +DOUBLE_FLOAT = 2 + +NOOP = 0 +ADD = 1 +MULTIPLY = 2 +DIVIDE = 3 +NEGATE = 4 +COMPARE = 5 +EXTENDSFDF = 6 +TRUNCDFSF = 7 + +UNKNOWN = -1 +ROUND_TO_NEAREST = 0 | round result to nearest representable value +ROUND_TO_ZERO = 1 | round result towards zero +ROUND_TO_PLUS = 2 | round result towards plus infinity +ROUND_TO_MINUS = 3 | round result towards minus infinity + +| Entry points: + + .globl SYM (__adddf3) + .globl SYM (__subdf3) + .globl SYM (__muldf3) + .globl SYM (__divdf3) + .globl SYM (__negdf2) + .globl SYM (__cmpdf2) + .globl SYM (__cmpdf2_internal) + .hidden SYM (__cmpdf2_internal) + + .text + .even + +| These are common routines to return and signal exceptions. + +Ld$den: +| Return and signal a denormalized number + orl d7,d0 + movew IMM (INEXACT_RESULT+UNDERFLOW),d7 + moveq IMM (DOUBLE_FLOAT),d6 + PICJUMP $_exception_handler + +Ld$infty: +Ld$overflow: +| Return a properly signed INFINITY and set the exception flags + movel IMM (0x7ff00000),d0 + movel IMM (0),d1 + orl d7,d0 + movew IMM (INEXACT_RESULT+OVERFLOW),d7 + moveq IMM (DOUBLE_FLOAT),d6 + PICJUMP $_exception_handler + +Ld$underflow: +| Return 0 and set the exception flags + movel IMM (0),d0 + movel d0,d1 + movew IMM (INEXACT_RESULT+UNDERFLOW),d7 + moveq IMM (DOUBLE_FLOAT),d6 + PICJUMP $_exception_handler + +Ld$inop: +| Return a quiet NaN and set the exception flags + movel IMM (QUIET_NaN),d0 + movel d0,d1 + movew IMM (INEXACT_RESULT+INVALID_OPERATION),d7 + moveq IMM (DOUBLE_FLOAT),d6 + PICJUMP $_exception_handler + +Ld$div$0: +| Return a properly signed INFINITY and set the exception flags + movel IMM (0x7ff00000),d0 + movel IMM (0),d1 + orl d7,d0 + movew IMM (INEXACT_RESULT+DIVIDE_BY_ZERO),d7 + moveq IMM 
(DOUBLE_FLOAT),d6 + PICJUMP $_exception_handler + +|============================================================================= +|============================================================================= +| double precision routines +|============================================================================= +|============================================================================= + +| A double precision floating point number (double) has the format: +| +| struct _double { +| unsigned int sign : 1; /* sign bit */ +| unsigned int exponent : 11; /* exponent, shifted by D_BIAS (1022) */ +| unsigned int fraction : 52; /* fraction */ +| } double; +| +| Thus sizeof(double) = 8 (64 bits). +| +| All the routines are callable from C programs, and return the result +| in the register pair d0-d1. They also preserve all registers except +| d0-d1 and a0-a1. + +|============================================================================= +| __subdf3 +|============================================================================= + +| double __subdf3(double, double); + FUNC(__subdf3) +SYM (__subdf3): + bchg IMM (31),sp@(12) | change sign of second operand + | and fall through, so we always add +|============================================================================= +| __adddf3 +|============================================================================= + +| double __adddf3(double, double); + FUNC(__adddf3) +SYM (__adddf3): +#ifndef __mcoldfire__ + link a6,IMM (0) | everything will be done in registers + moveml d2-d7,sp@- | save data registers d2-d7 (all but d0-d1) +#else + link a6,IMM (-24) | reserve 24 bytes of frame for d2-d7 + moveml d2-d7,sp@ | and save them there +#endif + movel a6@(8),d0 | get first operand + movel a6@(12),d1 | + movel a6@(16),d2 | get second operand + movel a6@(20),d3 | + + movel d0,d7 | get d0's sign bit in d7 ' + addl d1,d1 | check and clear sign bit of a, and gain one + addxl d0,d0 | bit of extra precision + beq Ladddf$b | if zero return second operand + + movel d2,d6 | save sign in d6 + addl 
d3,d3 | get rid of sign bit and gain one bit of + addxl d2,d2 | extra precision + beq Ladddf$a | if zero return first operand + + andl IMM (0x80000000),d7 | isolate a's sign bit ' + swap d6 | and also b's sign bit ' +#ifndef __mcoldfire__ + andw IMM (0x8000),d6 | + orw d6,d7 | and combine them into d7, so that a's sign ' + | bit is in the high word and b's is in the ' + | low word, so d6 is free to be used +#else + andl IMM (0x8000),d6 + orl d6,d7 +#endif + movel d7,a0 | now save d7 into a0, so d7 is free to + | be used also + +| Get the exponents and check for denormalized and/or infinity. + + movel IMM (0x001fffff),d6 | mask for the fraction + movel IMM (0x00200000),d7 | mask to put hidden bit back + + movel d0,d4 | + andl d6,d0 | get fraction in d0 + notl d6 | make d6 into mask for the exponent + andl d6,d4 | get exponent in d4 + beq Ladddf$a$den | branch if a is denormalized + cmpl d6,d4 | check for INFINITY or NaN + beq Ladddf$nf | + orl d7,d0 | and put hidden bit back +Ladddf$1: + swap d4 | shift right exponent so that it starts +#ifndef __mcoldfire__ + lsrw IMM (5),d4 | in bit 0 and not bit 20 +#else + lsrl IMM (5),d4 | in bit 0 and not bit 20 +#endif +| Now we have a's exponent in d4 and fraction in d0-d1 ' + movel d2,d5 | save b to get exponent + andl d6,d5 | get exponent in d5 + beq Ladddf$b$den | branch if b is denormalized + cmpl d6,d5 | check for INFINITY or NaN + beq Ladddf$nf + notl d6 | make d6 into mask for the fraction again + andl d6,d2 | and get fraction in d2 + orl d7,d2 | and put hidden bit back +Ladddf$2: + swap d5 | shift right exponent so that it starts +#ifndef __mcoldfire__ + lsrw IMM (5),d5 | in bit 0 and not bit 20 +#else + lsrl IMM (5),d5 | in bit 0 and not bit 20 +#endif + +| Now we have b's exponent in d5 and fraction in d2-d3. ' + +| The situation now is as follows: the signs are combined in a0, the +| numbers are in d0-d1 (a) and d2-d3 (b), and the exponents in d4 (a) +| and d5 (b). 
To do the rounding correctly we need to keep all the +| bits until the end, so we need to use d0-d1-d2-d3 for the first number +| and d4-d5-d6-d7 for the second. To do this we store (temporarily) the +| exponents in a2-a3. + +#ifndef __mcoldfire__ + moveml a2-a3,sp@- | save the address registers +#else + movel a2,sp@- + movel a3,sp@- + movel a4,sp@- +#endif + + movel d4,a2 | save the exponents + movel d5,a3 | + + movel IMM (0),d7 | and move the numbers around + movel d7,d6 | + movel d3,d5 | + movel d2,d4 | + movel d7,d3 | + movel d7,d2 | + +| Here we shift the numbers until the exponents are the same, and put +| the largest exponent in a2. +#ifndef __mcoldfire__ + exg d4,a2 | get exponents back + exg d5,a3 | + cmpw d4,d5 | compare the exponents +#else + movel d4,a4 | get exponents back + movel a2,d4 + movel a4,a2 + movel d5,a4 + movel a3,d5 + movel a4,a3 + cmpl d4,d5 | compare the exponents +#endif + beq Ladddf$3 | if equal don't shift ' + bhi 9f | branch if second exponent is higher + +| Here we have a's exponent larger than b's, so we have to shift b. 
We do +| this by using as counter d2: +1: movew d4,d2 | move largest exponent to d2 +#ifndef __mcoldfire__ + subw d5,d2 | and subtract second exponent + exg d4,a2 | get back the longs we saved + exg d5,a3 | +#else + subl d5,d2 | and subtract second exponent + movel d4,a4 | get back the longs we saved + movel a2,d4 + movel a4,a2 + movel d5,a4 + movel a3,d5 + movel a4,a3 +#endif +| if difference is too large we don't shift (actually, we can just exit) ' +#ifndef __mcoldfire__ + cmpw IMM (DBL_MANT_DIG+2),d2 +#else + cmpl IMM (DBL_MANT_DIG+2),d2 +#endif + bge Ladddf$b$small +#ifndef __mcoldfire__ + cmpw IMM (32),d2 | if difference >= 32, shift by longs +#else + cmpl IMM (32),d2 | if difference >= 32, shift by longs +#endif + bge 5f +2: +#ifndef __mcoldfire__ + cmpw IMM (16),d2 | if difference >= 16, shift by words +#else + cmpl IMM (16),d2 | if difference >= 16, shift by words +#endif + bge 6f + bra 3f | enter dbra loop + +4: +#ifndef __mcoldfire__ + lsrl IMM (1),d4 + roxrl IMM (1),d5 + roxrl IMM (1),d6 + roxrl IMM (1),d7 +#else + lsrl IMM (1),d7 + btst IMM (0),d6 + beq 10f + bset IMM (31),d7 +10: lsrl IMM (1),d6 + btst IMM (0),d5 + beq 11f + bset IMM (31),d6 +11: lsrl IMM (1),d5 + btst IMM (0),d4 + beq 12f + bset IMM (31),d5 +12: lsrl IMM (1),d4 +#endif +3: +#ifndef __mcoldfire__ + dbra d2,4b +#else + subql IMM (1),d2 + bpl 4b +#endif + movel IMM (0),d2 + movel d2,d3 + bra Ladddf$4 +5: + movel d6,d7 + movel d5,d6 + movel d4,d5 + movel IMM (0),d4 +#ifndef __mcoldfire__ + subw IMM (32),d2 +#else + subl IMM (32),d2 +#endif + bra 2b +6: + movew d6,d7 + swap d7 + movew d5,d6 + swap d6 + movew d4,d5 + swap d5 + movew IMM (0),d4 + swap d4 +#ifndef __mcoldfire__ + subw IMM (16),d2 +#else + subl IMM (16),d2 +#endif + bra 3b + +9: +#ifndef __mcoldfire__ + exg d4,d5 + movew d4,d6 + subw d5,d6 | keep d5 (largest exponent) in d4 + exg d4,a2 + exg d5,a3 +#else + movel d5,d6 + movel d4,d5 + movel d6,d4 + subl d5,d6 + movel d4,a4 + movel a2,d4 + movel a4,a2 + movel d5,a4 + movel 
a3,d5 + movel a4,a3 +#endif +| if difference is too large we don't shift (actually, we can just exit) ' +#ifndef __mcoldfire__ + cmpw IMM (DBL_MANT_DIG+2),d6 +#else + cmpl IMM (DBL_MANT_DIG+2),d6 +#endif + bge Ladddf$a$small +#ifndef __mcoldfire__ + cmpw IMM (32),d6 | if difference >= 32, shift by longs +#else + cmpl IMM (32),d6 | if difference >= 32, shift by longs +#endif + bge 5f +2: +#ifndef __mcoldfire__ + cmpw IMM (16),d6 | if difference >= 16, shift by words +#else + cmpl IMM (16),d6 | if difference >= 16, shift by words +#endif + bge 6f + bra 3f | enter dbra loop + +| Shift a (d0-d1-d2-d3) right by one bit per iteration. +4: +#ifndef __mcoldfire__ + lsrl IMM (1),d0 + roxrl IMM (1),d1 + roxrl IMM (1),d2 + roxrl IMM (1),d3 +#else + lsrl IMM (1),d3 + btst IMM (0),d2 + beq 10f + bset IMM (31),d3 +10: lsrl IMM (1),d2 + btst IMM (0),d1 + beq 11f + bset IMM (31),d2 +11: lsrl IMM (1),d1 + btst IMM (0),d0 + beq 12f + bset IMM (31),d1 +12: lsrl IMM (1),d0 +#endif +3: +#ifndef __mcoldfire__ + dbra d6,4b +#else + subql IMM (1),d6 + bpl 4b +#endif + movel IMM (0),d7 | the counter in d6 is no longer needed: + movel d7,d6 | clear d6-d7 + bra Ladddf$4 +| Shift a right by a whole long (32 bits) and adjust the counter. +5: + movel d2,d3 + movel d1,d2 + movel d0,d1 + movel IMM (0),d0 +#ifndef __mcoldfire__ + subw IMM (32),d6 +#else + subl IMM (32),d6 +#endif + bra 2b +| Shift a right by a word (16 bits) and adjust the counter. +6: + movew d2,d3 + swap d3 + movew d1,d2 + swap d2 + movew d0,d1 + swap d1 + movew IMM (0),d0 + swap d0 +#ifndef __mcoldfire__ + subw IMM (16),d6 +#else + subl IMM (16),d6 +#endif + bra 3b +Ladddf$3: +#ifndef __mcoldfire__ + exg d4,a2 + exg d5,a3 +#else + movel d4,a4 + movel a2,d4 + movel a4,a2 + movel d5,a4 + movel a3,d5 + movel a4,a3 +#endif +Ladddf$4: +| Now we have the numbers in d0--d3 and d4--d7, the exponent in a2, and +| the signs in a0. 
+ +| Here we have to decide whether to add or subtract the numbers: +#ifndef __mcoldfire__ + exg d7,a0 | get the signs + exg d6,a3 | a3 is free to be used +#else + movel d7,a4 + movel a0,d7 + movel a4,a0 + movel d6,a4 + movel a3,d6 + movel a4,a3 +#endif + movel d7,d6 | + movew IMM (0),d7 | get a's sign in d7 ' + swap d6 | + movew IMM (0),d6 | and b's sign in d6 ' + eorl d7,d6 | compare the signs + bmi Lsubdf$0 | if the signs are different we have + | to subtract +#ifndef __mcoldfire__ + exg d7,a0 | else we add the numbers + exg d6,a3 | +#else + movel d7,a4 + movel a0,d7 + movel a4,a0 + movel d6,a4 + movel a3,d6 + movel a4,a3 +#endif + addl d7,d3 | + addxl d6,d2 | + addxl d5,d1 | + addxl d4,d0 | + + movel a2,d4 | return exponent to d4 + movel a0,d7 | + andl IMM (0x80000000),d7 | d7 now has the sign + +#ifndef __mcoldfire__ + moveml sp@+,a2-a3 +#else + movel sp@+,a4 + movel sp@+,a3 + movel sp@+,a2 +#endif + +| Before rounding normalize so bit #DBL_MANT_DIG is set (we will consider +| the case of denormalized numbers in the rounding routine itself). +| As in the addition (not in the subtraction!) 
we could have set +| one more bit we check this: + btst IMM (DBL_MANT_DIG+1),d0 + beq 1f +#ifndef __mcoldfire__ + lsrl IMM (1),d0 + roxrl IMM (1),d1 + roxrl IMM (1),d2 + roxrl IMM (1),d3 + addw IMM (1),d4 +#else + lsrl IMM (1),d3 + btst IMM (0),d2 + beq 10f + bset IMM (31),d3 +10: lsrl IMM (1),d2 + btst IMM (0),d1 + beq 11f + bset IMM (31),d2 +11: lsrl IMM (1),d1 + btst IMM (0),d0 + beq 12f + bset IMM (31),d1 +12: lsrl IMM (1),d0 + addl IMM (1),d4 +#endif +1: + lea pc@(Ladddf$5),a0 | to return from rounding routine + PICLEA SYM (_fpCCR),a1 | check the rounding mode +#ifdef __mcoldfire__ + clrl d6 +#endif + movew a1@(6),d6 | rounding mode in d6 + beq Lround$to$nearest +#ifndef __mcoldfire__ + cmpw IMM (ROUND_TO_PLUS),d6 +#else + cmpl IMM (ROUND_TO_PLUS),d6 +#endif + bhi Lround$to$minus + blt Lround$to$zero + bra Lround$to$plus +Ladddf$5: +| Put back the exponent and check for overflow +#ifndef __mcoldfire__ + cmpw IMM (0x7ff),d4 | is the exponent big? +#else + cmpl IMM (0x7ff),d4 | is the exponent big? +#endif + bge 1f + bclr IMM (DBL_MANT_DIG-1),d0 +#ifndef __mcoldfire__ + lslw IMM (4),d4 | put exponent back into position +#else + lsll IMM (4),d4 | put exponent back into position +#endif + swap d0 | +#ifndef __mcoldfire__ + orw d4,d0 | +#else + orl d4,d0 | +#endif + swap d0 | + bra Ladddf$ret +1: + moveq IMM (ADD),d5 + bra Ld$overflow + +Lsubdf$0: +| Here we do the subtraction. 
+#ifndef __mcoldfire__ + exg d7,a0 | put sign back in a0 + exg d6,a3 | +#else + movel d7,a4 + movel a0,d7 + movel a4,a0 + movel d6,a4 + movel a3,d6 + movel a4,a3 +#endif + subl d7,d3 | + subxl d6,d2 | + subxl d5,d1 | + subxl d4,d0 | + beq Ladddf$ret$1 | if zero just exit + bpl 1f | if positive skip the following + movel a0,d7 | + bchg IMM (31),d7 | change sign bit in d7 + movel d7,a0 | + negl d3 | + negxl d2 | + negxl d1 | and negate result + negxl d0 | +1: + movel a2,d4 | return exponent to d4 + movel a0,d7 + andl IMM (0x80000000),d7 | isolate sign bit +#ifndef __mcoldfire__ + moveml sp@+,a2-a3 | +#else + movel sp@+,a4 + movel sp@+,a3 + movel sp@+,a2 +#endif + +| Before rounding normalize so bit #DBL_MANT_DIG is set (we will consider +| the case of denormalized numbers in the rounding routine itself). +| As in the addition (not in the subtraction!) we could have set +| one more bit we check this: + btst IMM (DBL_MANT_DIG+1),d0 + beq 1f +#ifndef __mcoldfire__ + lsrl IMM (1),d0 + roxrl IMM (1),d1 + roxrl IMM (1),d2 + roxrl IMM (1),d3 + addw IMM (1),d4 +#else + lsrl IMM (1),d3 + btst IMM (0),d2 + beq 10f + bset IMM (31),d3 +10: lsrl IMM (1),d2 + btst IMM (0),d1 + beq 11f + bset IMM (31),d2 +11: lsrl IMM (1),d1 + btst IMM (0),d0 + beq 12f + bset IMM (31),d1 +12: lsrl IMM (1),d0 + addl IMM (1),d4 +#endif +1: + lea pc@(Lsubdf$1),a0 | to return from rounding routine + PICLEA SYM (_fpCCR),a1 | check the rounding mode +#ifdef __mcoldfire__ + clrl d6 +#endif + movew a1@(6),d6 | rounding mode in d6 + beq Lround$to$nearest +#ifndef __mcoldfire__ + cmpw IMM (ROUND_TO_PLUS),d6 +#else + cmpl IMM (ROUND_TO_PLUS),d6 +#endif + bhi Lround$to$minus + blt Lround$to$zero + bra Lround$to$plus +Lsubdf$1: +| Put back the exponent and sign (we don't have overflow). 
' + bclr IMM (DBL_MANT_DIG-1),d0 +#ifndef __mcoldfire__ + lslw IMM (4),d4 | put exponent back into position +#else + lsll IMM (4),d4 | put exponent back into position +#endif + swap d0 | +#ifndef __mcoldfire__ + orw d4,d0 | +#else + orl d4,d0 | +#endif + swap d0 | + bra Ladddf$ret + +| If one of the numbers was too small (difference of exponents >= +| DBL_MANT_DIG+1) we return the other (and now we don't have to ' +| check for finiteness or zero). +Ladddf$a$small: +#ifndef __mcoldfire__ + moveml sp@+,a2-a3 +#else + movel sp@+,a4 + movel sp@+,a3 + movel sp@+,a2 +#endif + movel a6@(16),d0 + movel a6@(20),d1 + PICLEA SYM (_fpCCR),a0 + movew IMM (0),a0@ +#ifndef __mcoldfire__ + moveml sp@+,d2-d7 | restore data registers +#else + moveml sp@,d2-d7 + | XXX if frame pointer is ever removed, stack pointer must + | be adjusted here. +#endif + unlk a6 | and return + rts + +Ladddf$b$small: +#ifndef __mcoldfire__ + moveml sp@+,a2-a3 +#else + movel sp@+,a4 + movel sp@+,a3 + movel sp@+,a2 +#endif + movel a6@(8),d0 + movel a6@(12),d1 + PICLEA SYM (_fpCCR),a0 + movew IMM (0),a0@ +#ifndef __mcoldfire__ + moveml sp@+,d2-d7 | restore data registers +#else + moveml sp@,d2-d7 + | XXX if frame pointer is ever removed, stack pointer must + | be adjusted here. +#endif + unlk a6 | and return + rts + +Ladddf$a$den: + movel d7,d4 | d7 contains 0x00200000 + bra Ladddf$1 + +Ladddf$b$den: + movel d7,d5 | d7 contains 0x00200000 + notl d6 + bra Ladddf$2 + +Ladddf$b: +| Return b (if a is zero) + movel d2,d0 + movel d3,d1 + bne 1f | Check if b is -0 + cmpl IMM (0x80000000),d0 + bne 1f + andl IMM (0x80000000),d7 | Use the sign of a + clrl d0 + bra Ladddf$ret +Ladddf$a: + movel a6@(8),d0 + movel a6@(12),d1 +1: + moveq IMM (ADD),d5 +| Check for NaN and +/-INFINITY. 
+ movel d0,d7 | keep the sign bit of the value + andl IMM (0x80000000),d7 | in d7 + bclr IMM (31),d0 | and clear it in d0 + cmpl IMM (0x7ff00000),d0 | exponent field all ones? + bge 2f | then it is INFINITY or NaN + movel d0,d0 | check for zero, since we don't ' + bne Ladddf$ret | want to return -0 by mistake + bclr IMM (31),d7 | + bra Ladddf$ret | +2: + andl IMM (0x000fffff),d0 | check for NaN (nonzero fraction) + orl d1,d0 | + bne Ld$inop | + bra Ld$infty | + +Ladddf$ret$1: +#ifndef __mcoldfire__ + moveml sp@+,a2-a3 | restore regs and exit +#else + movel sp@+,a4 + movel sp@+,a3 + movel sp@+,a2 +#endif + +Ladddf$ret: +| Normal exit. + PICLEA SYM (_fpCCR),a0 + movew IMM (0),a0@ | store a zero word at _fpCCR + orl d7,d0 | put sign bit back +#ifndef __mcoldfire__ + moveml sp@+,d2-d7 +#else + moveml sp@,d2-d7 + | XXX if frame pointer is ever removed, stack pointer must + | be adjusted here. +#endif + unlk a6 + rts + +Ladddf$ret$den: +| Return a denormalized number. +#ifndef __mcoldfire__ + lsrl IMM (1),d0 | shift right once more + roxrl IMM (1),d1 | +#else + lsrl IMM (1),d1 + btst IMM (0),d0 + beq 10f + bset IMM (31),d1 +10: lsrl IMM (1),d0 +#endif + bra Ladddf$ret + +Ladddf$nf: + moveq IMM (ADD),d5 +| This could be faster but it is not worth the effort, since it is not +| executed very often. We sacrifice speed for clarity here. + movel a6@(8),d0 | get the numbers back (remember that we + movel a6@(12),d1 | did some processing already) + movel a6@(16),d2 | + movel a6@(20),d3 | + movel IMM (0x7ff00000),d4 | useful constant (INFINITY) + movel d0,d7 | save sign bits + movel d2,d6 | + bclr IMM (31),d0 | clear sign bits + bclr IMM (31),d2 | +| We know that one of them is either NaN or +/-INFINITY +| Check for NaN (if either one is NaN return NaN) + cmpl d4,d0 | check first a (d0) + bhi Ld$inop | if d0 > 0x7ff00000 or equal and + bne 2f + tstl d1 | d1 > 0, a is NaN + bne Ld$inop | +2: cmpl d4,d2 | check now b (d2) + bhi Ld$inop | if d2 > 0x7ff00000 or equal and + bne 3f + tstl d3 | d3 > 0, b is NaN + bne Ld$inop | +3: +| Now comes the check for +/-INFINITY. 
We know that both are (maybe not +| finite) numbers, but we have to check if both are infinite whether we +| are adding or subtracting them. + eorl d7,d6 | to check sign bits + bmi 1f + andl IMM (0x80000000),d7 | get (common) sign bit + bra Ld$infty +1: +| We know one (or both) are infinite, so we test for equality between the +| two numbers (if they are equal they have to be infinite both, so we +| return NaN). + cmpl d2,d0 | are both infinite? + bne 1f | if d0 <> d2 they are not equal + cmpl d3,d1 | if d0 == d2 test d3 and d1 + beq Ld$inop | if equal return NaN +1: + andl IMM (0x80000000),d7 | get a's sign bit ' + cmpl d4,d0 | test now for infinity + beq Ld$infty | if a is INFINITY return with this sign + bchg IMM (31),d7 | else we know b is INFINITY and has + bra Ld$infty | the opposite sign + +|============================================================================= +| __muldf3 +|============================================================================= + +| double __muldf3(double, double); + FUNC(__muldf3) +SYM (__muldf3): +#ifndef __mcoldfire__ + link a6,IMM (0) + moveml d2-d7,sp@- +#else + link a6,IMM (-24) + moveml d2-d7,sp@ +#endif + movel a6@(8),d0 | get a into d0-d1 + movel a6@(12),d1 | + movel a6@(16),d2 | and b into d2-d3 + movel a6@(20),d3 | + movel d0,d7 | d7 will hold the sign of the product + eorl d2,d7 | + andl IMM (0x80000000),d7 | + movel d7,a0 | save sign bit into a0 + movel IMM (0x7ff00000),d7 | useful constant (+INFINITY) + movel d7,d6 | another (mask for fraction) + notl d6 | + bclr IMM (31),d0 | get rid of a's sign bit ' + movel d0,d4 | + orl d1,d4 | + beq Lmuldf$a$0 | branch if a is zero + movel d0,d4 | + bclr IMM (31),d2 | get rid of b's sign bit ' + movel d2,d5 | + orl d3,d5 | + beq Lmuldf$b$0 | branch if b is zero + movel d2,d5 | + cmpl d7,d0 | is a big? + bhi Lmuldf$inop | if a is NaN return NaN + beq Lmuldf$a$nf | we still have to check d1 and b ... + cmpl d7,d2 | now compare b with INFINITY + bhi Lmuldf$inop | is b NaN? 
+ beq Lmuldf$b$nf | we still have to check d3 ... +| Here we have both numbers finite and nonzero (and with no sign bit). +| Now we get the exponents into d4 and d5. + andl d7,d4 | isolate exponent in d4 + beq Lmuldf$a$den | if exponent zero, have denormalized + andl d6,d0 | isolate fraction + orl IMM (0x00100000),d0 | and put hidden bit back + swap d4 | I like exponents in the first byte +#ifndef __mcoldfire__ + lsrw IMM (4),d4 | +#else + lsrl IMM (4),d4 | +#endif +Lmuldf$1: + andl d7,d5 | + beq Lmuldf$b$den | + andl d6,d2 | + orl IMM (0x00100000),d2 | and put hidden bit back + swap d5 | +#ifndef __mcoldfire__ + lsrw IMM (4),d5 | +#else + lsrl IMM (4),d5 | +#endif +Lmuldf$2: | +#ifndef __mcoldfire__ + addw d5,d4 | add exponents + subw IMM (D_BIAS+1),d4 | and subtract bias (plus one) +#else + addl d5,d4 | add exponents + subl IMM (D_BIAS+1),d4 | and subtract bias (plus one) +#endif + +| We are now ready to do the multiplication. The situation is as follows: +| both a and b have bit 52 ( bit 20 of d0 and d2) set (even if they were +| denormalized to start with!), which means that in the product bit 104 +| (which will correspond to bit 8 of the fourth long) is set. + +| Here we have to do the product. +| To do it we have to juggle the registers back and forth, as there are not +| enough to keep everything in them. So we use the address registers to keep +| some intermediate data. 
+ +#ifndef __mcoldfire__ + moveml a2-a3,sp@- | save a2 and a3 for temporary use +#else + movel a2,sp@- + movel a3,sp@- + movel a4,sp@- +#endif + movel IMM (0),a2 | a2 is a null register + movel d4,a3 | and a3 will preserve the exponent + +| First, shift d2-d3 so bit 20 becomes bit 31: +#ifndef __mcoldfire__ + rorl IMM (5),d2 | rotate d2 5 places right + swap d2 | and swap it + rorl IMM (5),d3 | do the same thing with d3 + swap d3 | + movew d3,d6 | get the rightmost 11 bits of d3 + andw IMM (0x07ff),d6 | + orw d6,d2 | and put them into d2 + andw IMM (0xf800),d3 | clear those bits in d3 +#else + moveq IMM (11),d7 | left shift d2 11 bits + lsll d7,d2 + movel d3,d6 | get a copy of d3 + lsll d7,d3 | left shift d3 11 bits + andl IMM (0xffe00000),d6 | get the top 11 bits of d3 + moveq IMM (21),d7 | right shift them 21 bits + lsrl d7,d6 + orl d6,d2 | stick them at the end of d2 +#endif + + movel d2,d6 | move b into d6-d7 + movel d3,d7 | move a into d4-d5 + movel d0,d4 | and clear d0-d1-d2-d3 (to put result) + movel d1,d5 | + movel IMM (0),d3 | + movel d3,d2 | + movel d3,d1 | + movel d3,d0 | + +| We use a1 as counter: + movel IMM (DBL_MANT_DIG-1),a1 +#ifndef __mcoldfire__ + exg d7,a1 +#else + movel d7,a4 + movel a1,d7 + movel a4,a1 +#endif + +1: +#ifndef __mcoldfire__ + exg d7,a1 | put counter back in a1 +#else + movel d7,a4 + movel a1,d7 + movel a4,a1 +#endif + addl d3,d3 | shift sum once left + addxl d2,d2 | + addxl d1,d1 | + addxl d0,d0 | + addl d7,d7 | + addxl d6,d6 | + bcc 2f | if bit clear skip the following +#ifndef __mcoldfire__ + exg d7,a2 | +#else + movel d7,a4 + movel a2,d7 + movel a4,a2 +#endif + addl d5,d3 | else add a to the sum + addxl d4,d2 | + addxl d7,d1 | + addxl d7,d0 | +#ifndef __mcoldfire__ + exg d7,a2 | +#else + movel d7,a4 + movel a2,d7 + movel a4,a2 +#endif +2: +#ifndef __mcoldfire__ + exg d7,a1 | put counter in d7 + dbf d7,1b | decrement and branch +#else + movel d7,a4 + movel a1,d7 + movel a4,a1 + subql IMM (1),d7 + bpl 1b +#endif + + movel a3,d4 
| restore exponent +#ifndef __mcoldfire__ + moveml sp@+,a2-a3 +#else + movel sp@+,a4 + movel sp@+,a3 + movel sp@+,a2 +#endif + +| Now we have the product in d0-d1-d2-d3, with bit 8 of d0 set. The +| first thing to do now is to normalize it so bit 8 becomes bit +| DBL_MANT_DIG-32 (to do the rounding); later we will shift right. + swap d0 + swap d1 + movew d1,d0 + swap d2 + movew d2,d1 + swap d3 + movew d3,d2 + movew IMM (0),d3 +#ifndef __mcoldfire__ + lsrl IMM (1),d0 + roxrl IMM (1),d1 + roxrl IMM (1),d2 + roxrl IMM (1),d3 + lsrl IMM (1),d0 + roxrl IMM (1),d1 + roxrl IMM (1),d2 + roxrl IMM (1),d3 + lsrl IMM (1),d0 + roxrl IMM (1),d1 + roxrl IMM (1),d2 + roxrl IMM (1),d3 +#else + moveq IMM (29),d6 + lsrl IMM (3),d3 + movel d2,d7 + lsll d6,d7 + orl d7,d3 + lsrl IMM (3),d2 + movel d1,d7 + lsll d6,d7 + orl d7,d2 + lsrl IMM (3),d1 + movel d0,d7 + lsll d6,d7 + orl d7,d1 + lsrl IMM (3),d0 +#endif + +| Now round, check for over- and underflow, and exit. + movel a0,d7 | get sign bit back into d7 + moveq IMM (MULTIPLY),d5 + + btst IMM (DBL_MANT_DIG+1-32),d0 + beq Lround$exit +#ifndef __mcoldfire__ + lsrl IMM (1),d0 + roxrl IMM (1),d1 + addw IMM (1),d4 +#else + lsrl IMM (1),d1 + btst IMM (0),d0 + beq 10f + bset IMM (31),d1 +10: lsrl IMM (1),d0 + addl IMM (1),d4 +#endif + bra Lround$exit + +Lmuldf$inop: + moveq IMM (MULTIPLY),d5 + bra Ld$inop + +Lmuldf$b$nf: + moveq IMM (MULTIPLY),d5 + movel a0,d7 | get sign bit back into d7 + tstl d3 | we know d2 == 0x7ff00000, so check d3 + bne Ld$inop | if d3 <> 0 b is NaN + bra Ld$overflow | else we have overflow (since a is finite) + +Lmuldf$a$nf: + moveq IMM (MULTIPLY),d5 + movel a0,d7 | get sign bit back into d7 + tstl d1 | we know d0 == 0x7ff00000, so check d1 + bne Ld$inop | if d1 <> 0 a is NaN + bra Ld$overflow | else signal overflow + +| If either number is zero return zero, unless the other is +/-INFINITY or +| NaN, in which case we return NaN. 
+Lmuldf$b$0: + moveq IMM (MULTIPLY),d5 +#ifndef __mcoldfire__ + exg d2,d0 | put b (==0) into d0-d1 + exg d3,d1 | and a (with sign bit cleared) into d2-d3 + movel a0,d0 | set result sign +#else + movel d0,d2 | put a into d2-d3 + movel d1,d3 + movel a0,d0 | put result zero into d0-d1 + movq IMM(0),d1 +#endif + bra 1f +Lmuldf$a$0: +| NOTE(review): unlike Lmuldf$b$0, this path does not load d5 with MULTIPLY +| before the possible jump to Ld$inop below; confirm whether the exception +| handler relies on d5. + movel a0,d0 | set result sign + movel a6@(16),d2 | put b into d2-d3 again + movel a6@(20),d3 | + bclr IMM (31),d2 | clear sign bit +1: cmpl IMM (0x7ff00000),d2 | check for non-finiteness + bge Ld$inop | in case NaN or +/-INFINITY return NaN + PICLEA SYM (_fpCCR),a0 + movew IMM (0),a0@ | store a zero word at _fpCCR +#ifndef __mcoldfire__ + moveml sp@+,d2-d7 +#else + moveml sp@,d2-d7 + | XXX if frame pointer is ever removed, stack pointer must + | be adjusted here. +#endif + unlk a6 + rts + +| If a number is denormalized we put an exponent of 1 but do not put the +| hidden bit back into the fraction; instead we shift left until bit 20 +| (the hidden bit) is set, adjusting the exponent accordingly. We do this +| to ensure that the product of the fractions is close to 1. 
+Lmuldf$a$den: + movel IMM (1),d4 + andl d6,d0 +1: addl d1,d1 | shift a left until bit 20 is set + addxl d0,d0 | +#ifndef __mcoldfire__ + subw IMM (1),d4 | and adjust exponent +#else + subl IMM (1),d4 | and adjust exponent +#endif + btst IMM (20),d0 | + bne Lmuldf$1 | + bra 1b + +Lmuldf$b$den: + movel IMM (1),d5 + andl d6,d2 +1: addl d3,d3 | shift b left until bit 20 is set + addxl d2,d2 | +#ifndef __mcoldfire__ + subw IMM (1),d5 | and adjust exponent +#else + subql IMM (1),d5 | and adjust exponent +#endif + btst IMM (20),d2 | + bne Lmuldf$2 | + bra 1b + + +|============================================================================= +| __divdf3 +|============================================================================= + +| double __divdf3(double, double); + FUNC(__divdf3) +SYM (__divdf3): +#ifndef __mcoldfire__ + link a6,IMM (0) + moveml d2-d7,sp@- +#else + link a6,IMM (-24) + moveml d2-d7,sp@ +#endif + movel a6@(8),d0 | get a into d0-d1 + movel a6@(12),d1 | + movel a6@(16),d2 | and b into d2-d3 + movel a6@(20),d3 | + movel d0,d7 | d7 will hold the sign of the result + eorl d2,d7 | + andl IMM (0x80000000),d7 + movel d7,a0 | save sign into a0 + movel IMM (0x7ff00000),d7 | useful constant (+INFINITY) + movel d7,d6 | another (mask for fraction) + notl d6 | + bclr IMM (31),d0 | get rid of a's sign bit ' + movel d0,d4 | + orl d1,d4 | + beq Ldivdf$a$0 | branch if a is zero + movel d0,d4 | + bclr IMM (31),d2 | get rid of b's sign bit ' + movel d2,d5 | + orl d3,d5 | + beq Ldivdf$b$0 | branch if b is zero + movel d2,d5 + cmpl d7,d0 | is a big? + bhi Ldivdf$inop | if a is NaN return NaN + beq Ldivdf$a$nf | if d0 == 0x7ff00000 we check d1 + cmpl d7,d2 | now compare b with INFINITY + bhi Ldivdf$inop | if b is NaN return NaN + beq Ldivdf$b$nf | if d2 == 0x7ff00000 we check d3 +| Here we have both numbers finite and nonzero (and with no sign bit). +| Now we get the exponents into d4 and d5 and normalize the numbers to +| ensure that the ratio of the fractions is around 1. 
We do this by +| making sure that both numbers have bit #DBL_MANT_DIG-32-1 (hidden bit) +| set, even if they were denormalized to start with. +| Thus, the result will satisfy: 2 > result > 1/2. + andl d7,d4 | and isolate exponent in d4 + beq Ldivdf$a$den | if exponent is zero we have a denormalized + andl d6,d0 | and isolate fraction + orl IMM (0x00100000),d0 | and put hidden bit back + swap d4 | I like exponents in the first byte +#ifndef __mcoldfire__ + lsrw IMM (4),d4 | +#else + lsrl IMM (4),d4 | +#endif +Ldivdf$1: | + andl d7,d5 | + beq Ldivdf$b$den | + andl d6,d2 | + orl IMM (0x00100000),d2 + swap d5 | +#ifndef __mcoldfire__ + lsrw IMM (4),d5 | +#else + lsrl IMM (4),d5 | +#endif +Ldivdf$2: | +#ifndef __mcoldfire__ + subw d5,d4 | subtract exponents + addw IMM (D_BIAS),d4 | and add bias +#else + subl d5,d4 | subtract exponents + addl IMM (D_BIAS),d4 | and add bias +#endif + +| We are now ready to do the division. We have prepared things in such a way +| that the ratio of the fractions will be less than 2 but greater than 1/2. +| At this point the registers in use are: +| d0-d1 hold a (first operand, bit DBL_MANT_DIG-32=0, bit +| DBL_MANT_DIG-1-32=1) +| d2-d3 hold b (second operand, bit DBL_MANT_DIG-32=1) +| d4 holds the difference of the exponents, corrected by the bias +| a0 holds the sign of the ratio + +| To do the rounding correctly we need to keep information about the +| nonsignificant bits. One way to do this would be to do the division +| using four registers; another is to use two registers (as originally +| I did), but use a sticky bit to preserve information about the +| fractional part. Note that we can keep that info in a1, which is not +| used. + movel IMM (0),d6 | d6-d7 will hold the result + movel d6,d7 | + movel IMM (0),a1 | and a1 will hold the sticky bit + + movel IMM (DBL_MANT_DIG-32+1),d5 + +1: cmpl d0,d2 | is a < b? 
+ bhi 3f | if b > a skip the following + beq 4f | if d0==d2 check d1 and d3 +2: subl d3,d1 | + subxl d2,d0 | a <-- a - b + bset d5,d6 | set the corresponding bit in d6 +3: addl d1,d1 | shift a by 1 + addxl d0,d0 | +#ifndef __mcoldfire__ + dbra d5,1b | and branch back +#else + subql IMM (1), d5 + bpl 1b +#endif + bra 5f +4: cmpl d1,d3 | here d0==d2, so check d1 and d3 + bhi 3b | if d3 > d1 (b > a) skip the subtraction + bra 2b | else go do it +5: +| Here we have to start setting the bits in the second long. + movel IMM (31),d5 | again d5 is counter + +1: cmpl d0,d2 | is a < b? + bhi 3f | if b > a skip the following + beq 4f | if d0==d2 check d1 and d3 +2: subl d3,d1 | + subxl d2,d0 | a <-- a - b + bset d5,d7 | set the corresponding bit in d7 +3: addl d1,d1 | shift a by 1 + addxl d0,d0 | +#ifndef __mcoldfire__ + dbra d5,1b | and branch back +#else + subql IMM (1), d5 + bpl 1b +#endif + bra 5f +4: cmpl d1,d3 | here d0==d2, so check d1 and d3 + bhi 3b | if d3 > d1 (b > a) skip the subtraction + bra 2b | else go do it +5: +| Now go ahead checking until we hit a one, which we store in d2. + movel IMM (DBL_MANT_DIG),d5 +1: cmpl d2,d0 | is a < b? + bhi 4f | if b < a, exit + beq 3f | if d0==d2 check d1 and d3 +2: addl d1,d1 | shift a by 1 + addxl d0,d0 | +#ifndef __mcoldfire__ + dbra d5,1b | and branch back +#else + subql IMM (1), d5 + bpl 1b +#endif + movel IMM (0),d2 | here no sticky bit was found + movel d2,d3 + bra 5f +3: cmpl d1,d3 | here d0==d2, so check d1 and d3 + bhi 2b | if d3 > d1 (b > a) go back +4: +| Here put the sticky bit in d2-d3 (in the position which actually corresponds +| to it; if you don't do this the algorithm loses in some cases). 
' + movel IMM (0),d2 + movel d2,d3 +| Turn the loop counter left in d5 into the bit number of the sticky bit +| within d2-d3 (32-63 selects d2, 0-31 selects d3). +#ifndef __mcoldfire__ + subw IMM (DBL_MANT_DIG),d5 + addw IMM (63),d5 + cmpw IMM (31),d5 +#else + subl IMM (DBL_MANT_DIG),d5 + addl IMM (63),d5 + cmpl IMM (31),d5 +#endif + bhi 2f | bit number above 31: it belongs in d2 +1: bset d5,d3 | else it belongs in d3 + bra 5f +| Reduce the bit number to the range 0-31 before setting it in d2. The +| label used to follow the subtraction, which made it unreachable. +2: +#ifndef __mcoldfire__ + subw IMM (32),d5 +#else + subl IMM (32),d5 +#endif + bset d5,d2 +5: +| Finally we are finished! Move the quotient collected in d6-d7 to its +| final destination: + movel d6,d0 + movel d7,d1 + movel IMM (0),d3 | d3 is cleared: only the sticky bit in d2 is kept + +| Here we have finished the division, with the result in d0-d1-d2-d3, with +| 2^21 <= d6 < 2^23. Thus bit 23 is not set, but bit 22 could be set. +| If it is not, then definitely bit 21 is set. Normalize so bit 22 is +| not set: + btst IMM (DBL_MANT_DIG-32+1),d0 + beq 1f +| Shift d0-d1-d2-d3 right by one bit and bump the exponent in d4. +#ifndef __mcoldfire__ + lsrl IMM (1),d0 + roxrl IMM (1),d1 + roxrl IMM (1),d2 + roxrl IMM (1),d3 + addw IMM (1),d4 +#else + lsrl IMM (1),d3 + btst IMM (0),d2 + beq 10f + bset IMM (31),d3 +10: lsrl IMM (1),d2 + btst IMM (0),d1 + beq 11f + bset IMM (31),d2 +11: lsrl IMM (1),d1 + btst IMM (0),d0 + beq 12f + bset IMM (31),d1 +12: lsrl IMM (1),d0 + addl IMM (1),d4 +#endif +1: +| Now round, check for over- and underflow, and exit. + movel a0,d7 | restore sign bit to d7 + moveq IMM (DIVIDE),d5 + bra Lround$exit + +Ldivdf$inop: + moveq IMM (DIVIDE),d5 + bra Ld$inop + +Ldivdf$a$0: +| If a is zero check to see whether b is zero also. In that case return +| NaN; then check if b is NaN, and return NaN also in that case. Else +| return a properly signed zero. 
+ moveq IMM (DIVIDE),d5 + bclr IMM (31),d2 | + movel d2,d4 | + orl d3,d4 | + beq Ld$inop | if b is also zero return NaN + cmpl IMM (0x7ff00000),d2 | check for NaN + bhi Ld$inop | + blt 1f | + tstl d3 | + bne Ld$inop | +1: movel a0,d0 | else return signed zero + moveq IMM(0),d1 | + PICLEA SYM (_fpCCR),a0 | clear exception flags + movew IMM (0),a0@ | +#ifndef __mcoldfire__ + moveml sp@+,d2-d7 | +#else + moveml sp@,d2-d7 | + | XXX if frame pointer is ever removed, stack pointer must + | be adjusted here. +#endif + unlk a6 | + rts | + +Ldivdf$b$0: + moveq IMM (DIVIDE),d5 +| If we got here a is not zero. Check if a is NaN; in that case return NaN, +| else return +/-INFINITY. Remember that a is in d0 with the sign bit +| cleared already. + movel a0,d7 | put a's sign bit back in d7 ' + cmpl IMM (0x7ff00000),d0 | compare d0 with INFINITY + bhi Ld$inop | if larger it is NaN + tstl d1 | + bne Ld$inop | + bra Ld$div$0 | else signal DIVIDE_BY_ZERO + +Ldivdf$b$nf: + moveq IMM (DIVIDE),d5 +| If d2 == 0x7ff00000 we have to check d3. + tstl d3 | + bne Ld$inop | if d3 <> 0, b is NaN + bra Ld$underflow | else b is +/-INFINITY, so signal underflow + +Ldivdf$a$nf: + moveq IMM (DIVIDE),d5 +| If d0 == 0x7ff00000 we have to check d1. + tstl d1 | + bne Ld$inop | if d1 <> 0, a is NaN +| If a is INFINITY we have to check b + cmpl d7,d2 | compare b with INFINITY + bge Ld$inop | if b is NaN or INFINITY return NaN + tstl d3 | + bne Ld$inop | + bra Ld$overflow | else return overflow + +| If a number is denormalized we put an exponent of 1 but do not put the +| bit back into the fraction. 
Ldivdf$a$den:
| a is denormalized: start from exponent 1 (kept in d4) and shift the
| fraction d0-d1 left, decrementing d4, until bit DBL_MANT_DIG-32-1 of d0
| is set; then rejoin the main path at Ldivdf$1.
	movel	IMM (1),d4
	andl	d6,d0
1:	addl	d1,d1		| shift a left until bit 20 is set
	addxl	d0,d0
#ifndef __mcoldfire__
	subw	IMM (1),d4	| and adjust exponent
#else
	subl	IMM (1),d4	| and adjust exponent
#endif
	btst	IMM (DBL_MANT_DIG-32-1),d0
	bne	Ldivdf$1
	bra	1b

Ldivdf$b$den:
| Same normalization for b: fraction in d2-d3, exponent kept in d5,
| rejoining at Ldivdf$2.
	movel	IMM (1),d5
	andl	d6,d2
1:	addl	d3,d3		| shift b left until bit 20 is set
	addxl	d2,d2
#ifndef __mcoldfire__
	subw	IMM (1),d5	| and adjust exponent
#else
	subql	IMM (1),d5	| and adjust exponent
#endif
	btst	IMM (DBL_MANT_DIG-32-1),d2
	bne	Ldivdf$2
	bra	1b

Lround$exit:
| This is a common exit point for __muldf3 and __divdf3. When they enter
| this point the sign of the result is in d7, the result in d0-d1, normalized
| so that 2^21 <= d0 < 2^22, and the exponent is in the lower byte of d4.

| First check for underflow in the exponent:
#ifndef __mcoldfire__
	cmpw	IMM (-DBL_MANT_DIG-1),d4
#else
	cmpl	IMM (-DBL_MANT_DIG-1),d4
#endif
	blt	Ld$underflow
| It could happen that the exponent is less than 1, in which case the
| number is denormalized. In this case we shift right and adjust the
| exponent until it becomes 1 or the fraction is zero (in the latter case
| we signal underflow and return zero).
	movel	d7,a0		| park the sign in a0, d7 is reused below
	movel	IMM (0),d6	| use d6-d7 to collect bits flushed right
	movel	d6,d7		| use d6-d7 to collect bits flushed right
#ifndef __mcoldfire__
	cmpw	IMM (1),d4	| if the exponent is less than 1 we
#else
	cmpl	IMM (1),d4	| if the exponent is less than 1 we
#endif
	bge	2f		| have to shift right (denormalize)
1:
| One step of the denormalizing loop: the 192-bit value d0-d1-d2-d3-d6-d7
| moves right by one bit and the exponent goes up by one.
#ifndef __mcoldfire__
	addw	IMM (1),d4	| adjust the exponent
	lsrl	IMM (1),d0	| shift right once
	roxrl	IMM (1),d1	|
	roxrl	IMM (1),d2	|
	roxrl	IMM (1),d3	|
	roxrl	IMM (1),d6	|
	roxrl	IMM (1),d7	|
	cmpw	IMM (1),d4	| is the exponent 1 already?
#else
| No roxrl here: shift from the low end upwards and carry bit 0 of each
| higher long into bit 31 of the next lower one by hand.
	addl	IMM (1),d4	| adjust the exponent
	lsrl	IMM (1),d7
	btst	IMM (0),d6
	beq	13f
	bset	IMM (31),d7
13:	lsrl	IMM (1),d6
	btst	IMM (0),d3
	beq	14f
	bset	IMM (31),d6
14:	lsrl	IMM (1),d3
	btst	IMM (0),d2
	beq	10f
	bset	IMM (31),d3
10:	lsrl	IMM (1),d2
	btst	IMM (0),d1
	beq	11f
	bset	IMM (31),d2
11:	lsrl	IMM (1),d1
	btst	IMM (0),d0
	beq	12f
	bset	IMM (31),d1
12:	lsrl	IMM (1),d0
	cmpl	IMM (1),d4	| is the exponent 1 already?
#endif
	beq	2f		| exponent reached 1: stop shifting
	bra	1b              | otherwise shift once more
	bra	Ld$underflow	| safety check, shouldn't execute '
2:	orl	d6,d2		| this is a trick so we don't lose '
	orl	d7,d3		| the bits which were flushed right
	movel	a0,d7		| get back sign bit into d7
| Now call the rounding routine (which takes care of denormalized numbers):
| The routine is entered by a plain branch and comes back through a0, which
| is loaded with the address of Lround$0.
	lea	pc@(Lround$0),a0 | to return from rounding routine
	PICLEA	SYM (_fpCCR),a1	| check the rounding mode
#ifdef __mcoldfire__
	clrl	d6
#endif
	movew	a1@(6),d6	| rounding mode in d6
	beq	Lround$to$nearest
#ifndef __mcoldfire__
	cmpw	IMM (ROUND_TO_PLUS),d6
#else
	cmpl	IMM (ROUND_TO_PLUS),d6
#endif
	bhi	Lround$to$minus
	blt	Lround$to$zero
	bra	Lround$to$plus
Lround$0:
| Here we have a correctly rounded result (either normalized or denormalized).

| Here we should have either a normalized number or a denormalized one, and
| the exponent is necessarily larger or equal to 1 (so we don't have to '
| check again for underflow!). We have to check for overflow or for a
| denormalized number (which also signals underflow).
| Check for overflow (i.e., exponent >= 0x7ff).
#ifndef __mcoldfire__
	cmpw	IMM (0x07ff),d4
#else
	cmpl	IMM (0x07ff),d4
#endif
	bge	Ld$overflow
| Now check for a denormalized number (exponent==0):
	movew	d4,d4		| move onto itself just to set the flags
	beq	Ld$den
1:
| Put back the exponents and sign and return.
#ifndef __mcoldfire__
	lslw	IMM (4),d4	| exponent back to fourth byte
#else
	lsll	IMM (4),d4	| exponent back to fourth byte
#endif
	bclr	IMM (DBL_MANT_DIG-32-1),d0
	swap	d0		| and put back exponent
#ifndef __mcoldfire__
	orw	d4,d0		|
#else
	orl	d4,d0		|
#endif
	swap	d0		|
	orl	d7,d0		| and sign also

	PICLEA	SYM (_fpCCR),a0
	movew	IMM (0),a0@
#ifndef __mcoldfire__
	moveml	sp@+,d2-d7
#else
	moveml	sp@,d2-d7
	| XXX if frame pointer is ever removed, stack pointer must
	| be adjusted here.
#endif
	unlk	a6
	rts

|=============================================================================
|                              __negdf2
|=============================================================================

| double __negdf2(double);
| Takes its single operand from a6@(8)/a6@(12) and returns it in d0-d1 with
| the sign bit flipped. A zero of either sign comes back as +0, a NaN goes
| to Ld$inop and an INFINITY goes to Ld$infty with the flipped sign in d7.
	FUNC(__negdf2)
SYM (__negdf2):
#ifndef __mcoldfire__
	link	a6,IMM (0)
	moveml	d2-d7,sp@-
#else
	link	a6,IMM (-24)
	moveml	d2-d7,sp@
#endif
	moveq	IMM (NEGATE),d5
	movel	a6@(8),d0	| get number to negate in d0-d1
	movel	a6@(12),d1	|
	bchg	IMM (31),d0	| negate
	movel	d0,d2		| make a positive copy (for the tests)
	bclr	IMM (31),d2	|
	movel	d2,d4		| check for zero
	orl	d1,d4		|
	beq	2f		| if zero (either sign) return +zero
	cmpl	IMM (0x7ff00000),d2 | compare to +INFINITY
	blt	1f		| if finite, return
	bhi	Ld$inop		| if larger (fraction not zero) is NaN
	tstl	d1		| if d2 == 0x7ff00000 check d1
	bne	Ld$inop		|
	movel	d0,d7		| else get sign and return INFINITY
	andl	IMM (0x80000000),d7
	bra	Ld$infty
1:	PICLEA	SYM (_fpCCR),a0
	movew	IMM (0),a0@
#ifndef __mcoldfire__
	moveml	sp@+,d2-d7
#else
	moveml	sp@,d2-d7
	| XXX if frame pointer is ever removed, stack pointer must
	| be adjusted here.
#endif
	unlk	a6
	rts
2:	bclr	IMM (31),d0
	bra	1b

|=============================================================================
|                              __cmpdf2
|=============================================================================

GREATER =  1
LESS    = -1
EQUAL   =  0

| int __cmpdf2_internal(double, double, int);
| Compares a (a6@(8)/a6@(12)) with b (a6@(16)/a6@(20)) and returns GREATER,
| LESS or EQUAL in d0. If either operand is a NaN the third argument,
| a6@(24), is returned instead and the invalid operation is reported through
| the exception handler (see Lcmpd$inop).
SYM (__cmpdf2_internal):
#ifndef __mcoldfire__
	link	a6,IMM (0)
	moveml	d2-d7,sp@- 	| save registers
#else
	link	a6,IMM (-24)
	moveml	d2-d7,sp@
#endif
	moveq	IMM (COMPARE),d5
	movel	a6@(8),d0	| get first operand
	movel	a6@(12),d1	|
	movel	a6@(16),d2	| get second operand
	movel	a6@(20),d3	|
| First check if a and/or b are (+/-) zero and in that case clear
| the sign bit.
	movel	d0,d6		| copy signs into d6 (a) and d7(b)
	bclr	IMM (31),d0	| and clear signs in d0 and d2
	movel	d2,d7		|
	bclr	IMM (31),d2	|
	cmpl	IMM (0x7ff00000),d0 | check for a == NaN
	bhi	Lcmpd$inop		| if d0 > 0x7ff00000, a is NaN
	beq	Lcmpdf$a$nf	| if equal can be INFINITY, so check d1
	movel	d0,d4		| copy into d4 to test for zero
	orl	d1,d4		|
	beq	Lcmpdf$a$0	|
Lcmpdf$0:
	cmpl	IMM (0x7ff00000),d2 | check for b == NaN
	bhi	Lcmpd$inop		| if d2 > 0x7ff00000, b is NaN
	beq	Lcmpdf$b$nf	| if equal can be INFINITY, so check d3
	movel	d2,d4		|
	orl	d3,d4		|
	beq	Lcmpdf$b$0	|
Lcmpdf$1:
| Check the signs
	eorl	d6,d7
	bpl	1f
| If the signs are not equal check if a >= 0
	tstl	d6
	bpl	Lcmpdf$a$gt$b	| if (a >= 0 && b < 0) => a > b
	bmi	Lcmpdf$b$gt$a	| if (a < 0 && b >= 0) => a < b
1:
| If the signs are equal check for < 0
	tstl	d6
	bpl	1f
| If both are negative exchange them
#ifndef __mcoldfire__
	exg	d0,d2
	exg	d1,d3
#else
	movel	d0,d7
	movel	d2,d0
	movel	d7,d2
	movel	d1,d7
	movel	d3,d1
	movel	d7,d3
#endif
1:
| Now that they are positive we just compare them as longs (does this also
| work for denormalized numbers?).
	cmpl	d0,d2
	bhi	Lcmpdf$b$gt$a	| |b| > |a|
	bne	Lcmpdf$a$gt$b	| |b| < |a|
| If we got here d0 == d2, so we compare d1 and d3.
	cmpl	d1,d3
	bhi	Lcmpdf$b$gt$a	| |b| > |a|
	bne	Lcmpdf$a$gt$b	| |b| < |a|
| If we got here a == b.
	movel	IMM (EQUAL),d0
#ifndef __mcoldfire__
	moveml	sp@+,d2-d7 	| put back the registers
#else
	moveml	sp@,d2-d7
	| XXX if frame pointer is ever removed, stack pointer must
	| be adjusted here.
#endif
	unlk	a6
	rts
Lcmpdf$a$gt$b:
	movel	IMM (GREATER),d0
#ifndef __mcoldfire__
	moveml	sp@+,d2-d7 	| put back the registers
#else
	moveml	sp@,d2-d7
	| XXX if frame pointer is ever removed, stack pointer must
	| be adjusted here.
#endif
	unlk	a6
	rts
Lcmpdf$b$gt$a:
	movel	IMM (LESS),d0
#ifndef __mcoldfire__
	moveml	sp@+,d2-d7 	| put back the registers
#else
	moveml	sp@,d2-d7
	| XXX if frame pointer is ever removed, stack pointer must
	| be adjusted here.
#endif
	unlk	a6
	rts

Lcmpdf$a$0:
	bclr	IMM (31),d6	| a is +/-0: treat it as positive
	bra	Lcmpdf$0
Lcmpdf$b$0:
	bclr	IMM (31),d7	| b is +/-0: treat it as positive
	bra	Lcmpdf$1

| The high word equals 0x7ff00000: the operand is INFINITY when the low word
| is zero and a NaN otherwise. A NaN found here must take the same exit as a
| NaN found by the high-word test above, i.e. Lcmpd$inop, which returns the
| third argument. Ld$inop would put the quiet NaN pattern into d0 instead.
Lcmpdf$a$nf:
	tstl	d1
	bne	Lcmpd$inop	| low word <> 0: a is NaN
	bra	Lcmpdf$0

Lcmpdf$b$nf:
	tstl	d3
	bne	Lcmpd$inop	| low word <> 0: b is NaN
	bra	Lcmpdf$1

Lcmpd$inop:
| Unordered result: return the third argument and report an invalid
| operation.
	movl	a6@(24),d0
	moveq	IMM (INEXACT_RESULT+INVALID_OPERATION),d7
	moveq	IMM (DOUBLE_FLOAT),d6
	PICJUMP	$_exception_handler

| int __cmpdf2(double, double);
| Wrapper: pushes 1 as the unordered result, then both operands again, and
| calls __cmpdf2_internal. The unlk also discards the pushed arguments.
	FUNC(__cmpdf2)
SYM (__cmpdf2):
	link	a6,IMM (0)
	pea	1
	movl	a6@(20),sp@-
	movl	a6@(16),sp@-
	movl	a6@(12),sp@-
	movl	a6@(8),sp@-
	PICCALL	SYM (__cmpdf2_internal)
	unlk	a6
	rts

|=============================================================================
|                           rounding routines
|=============================================================================

| The rounding routines expect the number to be normalized in registers
| d0-d1-d2-d3, with the exponent in register d4. They assume that the
| exponent is larger or equal to 1. They return a properly normalized number
| if possible, and a denormalized number otherwise. The exponent is returned
| in d4.

Lround$to$nearest:
| We now normalize as suggested by D.
| Knuth ("Seminumerical Algorithms"):
| Here we assume that the exponent is not too small (this should be checked
| before entering the rounding routine), but the number could be denormalized.
| The routine is entered by a branch and returns with jmp a0@, so the
| caller must have loaded its continuation address into a0.

| Check for denormalized numbers:
1:	btst	IMM (DBL_MANT_DIG-32),d0
	bne	2f		| if set the number is normalized
| Normalize shifting left until bit #DBL_MANT_DIG-32 is set or the exponent
| is one (remember that a denormalized number corresponds to an
| exponent of -D_BIAS+1).
#ifndef __mcoldfire__
	cmpw	IMM (1),d4	| remember that the exponent is at least one
#else
	cmpl	IMM (1),d4	| remember that the exponent is at least one
#endif
 	beq	2f		| an exponent of one means denormalized
	addl	d3,d3		| else shift and adjust the exponent
	addxl	d2,d2		|
	addxl	d1,d1		|
	addxl	d0,d0		|
#ifndef __mcoldfire__
	dbra	d4,1b		| decrements d4 and loops back
#else
	subql	IMM (1), d4
	bpl	1b
#endif
2:
| Now round: we do it as follows: after the shifting we can write the
| fraction part as f + delta, where 1 < f < 2^25, and 0 <= delta <= 2.
| If delta < 1, do nothing. If delta > 1, add 1 to f.
| If delta == 1, we make sure the rounded number will be even (odd?)
| (after shifting).
| In register terms: bit 0 of d1 is the bit dropped by the final right
| shift, d2-d3 hold everything below it, and bit 1 of d1 becomes the least
| significant bit of the result.
	btst	IMM (0),d1	| is delta < 1?
	beq	2f		| if so, do not do anything
	orl	d2,d3		| is delta == 1? (d2-d3 all zero)
	bne	1f		| no, delta > 1: go add 1
	movel	d1,d3		| exact tie: round to even
	andl	IMM (2),d3	| bit 1 is the last significant bit
	movel	IMM (0),d2	|
	addl	d3,d1		| adds 2 only when that bit is set
	addxl	d2,d0		| propagate the carry into d0
	bra	2f		|
1:	movel	IMM (1),d3	| else add 1
	movel	IMM (0),d2	|
	addl	d3,d1		|
	addxl	d2,d0
| Shift right once (because we used bit #DBL_MANT_DIG-32!).
2:
#ifndef __mcoldfire__
	lsrl	IMM (1),d0
	roxrl	IMM (1),d1
#else
	lsrl	IMM (1),d1
	btst	IMM (0),d0
	beq	10f
	bset	IMM (31),d1
10:	lsrl	IMM (1),d0
#endif

| Now check again bit #DBL_MANT_DIG-32 (rounding could have produced a
| 'fraction overflow' ...).
	btst	IMM (DBL_MANT_DIG-32),d0
	beq	1f
#ifndef __mcoldfire__
	lsrl	IMM (1),d0
	roxrl	IMM (1),d1
	addw	IMM (1),d4
#else
	lsrl	IMM (1),d1
	btst	IMM (0),d0
	beq	10f
	bset	IMM (31),d1
10:	lsrl	IMM (1),d0
	addl	IMM (1),d4
#endif
1:
| If bit #DBL_MANT_DIG-32-1 is clear we have a denormalized number, so we
| have to put the exponent to zero and return a denormalized number.
	btst	IMM (DBL_MANT_DIG-32-1),d0
	beq	1f
	jmp	a0@
1:	movel	IMM (0),d4
	jmp	a0@

| The other three rounding modes are not implemented: all of them return to
| the caller at once, leaving the number as it was handed in.
Lround$to$zero:
Lround$to$plus:
Lround$to$minus:
	jmp	a0@
#endif /* L_double */

#ifdef  L_float

	.globl	SYM (_fpCCR)
	.globl  $_exception_handler

| Bit patterns of the special single precision values.
QUIET_NaN    = 0xffffffff
SIGNL_NaN    = 0x7f800001
INFINITY     = 0x7f800000

F_MAX_EXP      = 0xff
F_BIAS         = 126
FLT_MAX_EXP    = F_MAX_EXP - F_BIAS
FLT_MIN_EXP    = 1 - F_BIAS
FLT_MANT_DIG   = 24

| Exception flag bits, loaded into d7 before jumping to the handler.
INEXACT_RESULT 		= 0x0001
UNDERFLOW 		= 0x0002
OVERFLOW 		= 0x0004
DIVIDE_BY_ZERO 		= 0x0008
INVALID_OPERATION 	= 0x0010

| Operand format code, loaded into d6 before jumping to the handler.
SINGLE_FLOAT = 1

| Operation codes, loaded into d5 by the routines on their exception paths.
NOOP         = 0
ADD          = 1
MULTIPLY     = 2
DIVIDE       = 3
NEGATE       = 4
COMPARE      = 5
EXTENDSFDF   = 6
TRUNCDFSF    = 7

UNKNOWN           = -1
ROUND_TO_NEAREST  = 0 | round result to nearest representable value
ROUND_TO_ZERO     = 1 | round result towards zero
ROUND_TO_PLUS     = 2 | round result towards plus infinity
ROUND_TO_MINUS    = 3 | round result towards minus infinity

| Entry points:

	.globl SYM (__addsf3)
	.globl SYM (__subsf3)
	.globl SYM (__mulsf3)
	.globl SYM (__divsf3)
	.globl SYM (__negsf2)
	.globl SYM (__cmpsf2)
	.globl SYM (__cmpsf2_internal)
	.hidden SYM (__cmpsf2_internal)

| These are common routines to return and signal exceptions.

	.text
	.even

| Each stub below builds the return value in d0, loads the exception flags
| into d7 and the operand format into d6, and then jumps to
| $_exception_handler. Where the result carries a sign, the stub expects
| that sign bit in d7 on entry and ORs it into d0 before d7 is reloaded.

Lf$den:
| Return and signal a denormalized number
	orl	d7,d0		| merge the sign into the value already in d0
	moveq	IMM (INEXACT_RESULT+UNDERFLOW),d7
	moveq	IMM (SINGLE_FLOAT),d6
	PICJUMP	$_exception_handler

Lf$infty:
Lf$overflow:
| Return a properly signed INFINITY and set the exception flags
	movel	IMM (INFINITY),d0
	orl	d7,d0		| d7 holds the sign bit on entry
	moveq	IMM (INEXACT_RESULT+OVERFLOW),d7
	moveq	IMM (SINGLE_FLOAT),d6
	PICJUMP	$_exception_handler

Lf$underflow:
| Return 0 and set the exception flags
	moveq	IMM (0),d0	| always +0: no sign is merged here
	moveq	IMM (INEXACT_RESULT+UNDERFLOW),d7
	moveq	IMM (SINGLE_FLOAT),d6
	PICJUMP	$_exception_handler

Lf$inop:
| Return a quiet NaN and set the exception flags
	movel	IMM (QUIET_NaN),d0
	moveq	IMM (INEXACT_RESULT+INVALID_OPERATION),d7
	moveq	IMM (SINGLE_FLOAT),d6
	PICJUMP	$_exception_handler

Lf$div$0:
| Return a properly signed INFINITY and set the exception flags
	movel	IMM (INFINITY),d0
	orl	d7,d0		| d7 holds the sign bit on entry
	moveq	IMM (INEXACT_RESULT+DIVIDE_BY_ZERO),d7
	moveq	IMM (SINGLE_FLOAT),d6
	PICJUMP	$_exception_handler

|=============================================================================
|=============================================================================
|                         single precision routines
|=============================================================================
|=============================================================================

| A single precision floating point number (float) has the format:
|
| struct _float {
|  unsigned int sign      : 1;  /* sign bit */
|  unsigned int exponent  : 8;  /* exponent, shifted by 126 */
|  unsigned int fraction  : 23; /* fraction */
| } float;
|
| Thus sizeof(float) = 4 (32 bits).
|
| All the routines are callable from C programs, and return the result
| in the single register d0. They also preserve all registers except
| d0-d1 and a0-a1.

|=============================================================================
|                              __subsf3
|=============================================================================

| float __subsf3(float, float);
| Flips the sign of the second operand in place on the stack (no frame has
| been linked yet, so it sits at sp@(8)) and falls into __addsf3.
	FUNC(__subsf3)
SYM (__subsf3):
	bchg	IMM (31),sp@(8)	| change sign of second operand
				| and fall through
|=============================================================================
|                              __addsf3
|=============================================================================

| float __addsf3(float, float);
| Adds a (a6@(8)) and b (a6@(12)) and returns the sum in d0. Both fractions
| are kept shifted left by one bit in a register pair (a in d0-d1, b in
| d2-d3), aligned to the larger exponent, added or subtracted according to
| the signs saved in a0/a1, and then rounded according to the mode read
| from _fpCCR.
	FUNC(__addsf3)
SYM (__addsf3):
#ifndef __mcoldfire__
	link	a6,IMM (0)	| everything will be done in registers
	moveml	d2-d7,sp@-	| save all data registers but d0-d1
#else
	link	a6,IMM (-24)
	moveml	d2-d7,sp@
#endif
	movel	a6@(8),d0	| get first operand
	movel	a6@(12),d1	| get second operand
	movel	d0,a0		| get d0's sign bit '
	addl	d0,d0		| check and clear sign bit of a
	beq	Laddsf$b	| if zero return second operand
	movel	d1,a1		| save b's sign bit '
	addl	d1,d1		| get rid of sign bit
	beq	Laddsf$a	| if zero return first operand

| Get the exponents and check for denormalized and/or infinity.

	movel	IMM (0x00ffffff),d4	| mask to get fraction
	movel	IMM (0x01000000),d5	| mask to put hidden bit back

	movel	d0,d6		| save a to get exponent
	andl	d4,d0		| get fraction in d0
	notl 	d4		| make d4 into a mask for the exponent
	andl	d4,d6		| get exponent in d6
	beq	Laddsf$a$den	| branch if a is denormalized
	cmpl	d4,d6		| check for INFINITY or NaN
	beq	Laddsf$nf
	swap	d6		| put exponent into first word
	orl	d5,d0		| and put hidden bit back
Laddsf$1:
| Now we have a's exponent in d6 (second byte) and the mantissa in d0. '
	movel	d1,d7		| get exponent in d7
	andl	d4,d7		|
	beq	Laddsf$b$den	| branch if b is denormalized
	cmpl	d4,d7		| check for INFINITY or NaN
	beq	Laddsf$nf
	swap	d7		| put exponent into first word
	notl 	d4		| make d4 into a mask for the fraction
	andl	d4,d1		| get fraction in d1
	orl	d5,d1		| and put hidden bit back
Laddsf$2:
| Now we have b's exponent in d7 (second byte) and the mantissa in d1. '

| Note that the hidden bit corresponds to bit #FLT_MANT_DIG-1, and we
| shifted right once, so bit #FLT_MANT_DIG is set (so we have one extra
| bit).

	movel	d1,d2		| move b to d2, since we want to use
				| two registers to do the sum
	movel	IMM (0),d1	| and clear the new ones
	movel	d1,d3		|

| Here we shift the numbers in registers d0 and d1 so the exponents are the
| same, and put the largest exponent in d6. Note that we are using two
| registers for each number (see the discussion by D. Knuth in "Seminumerical
| Algorithms").
#ifndef __mcoldfire__
	cmpw	d6,d7		| compare exponents
#else
	cmpl	d6,d7		| compare exponents
#endif
	beq	Laddsf$3	| if equal don't shift '
	bhi	5f		| branch if second exponent largest
1:
	subl	d6,d7		| keep the largest exponent
	negl	d7
#ifndef __mcoldfire__
	lsrw	IMM (8),d7	| put difference in lower byte
#else
	lsrl	IMM (8),d7	| put difference in lower byte
#endif
| if difference is too large we don't shift (actually, we can just exit) '
#ifndef __mcoldfire__
	cmpw	IMM (FLT_MANT_DIG+2),d7
#else
	cmpl	IMM (FLT_MANT_DIG+2),d7
#endif
	bge	Laddsf$b$small
#ifndef __mcoldfire__
	cmpw	IMM (16),d7	| if difference >= 16 swap
#else
	cmpl	IMM (16),d7	| if difference >= 16 swap
#endif
	bge	4f
2:
#ifndef __mcoldfire__
	subw	IMM (1),d7
#else
	subql	IMM (1), d7
#endif
3:
#ifndef __mcoldfire__
	lsrl	IMM (1),d2	| shift right second operand
	roxrl	IMM (1),d3
	dbra	d7,3b
#else
	lsrl	IMM (1),d3
	btst	IMM (0),d2
	beq	10f
	bset	IMM (31),d3
10:	lsrl	IMM (1),d2
	subql	IMM (1), d7
	bpl	3b
#endif
	bra	Laddsf$3
4:
| Shift b right by 16 in one go: the low word of d2 becomes the high word
| of d3 and the high word of d2 moves down.
	movew	d2,d3
	swap	d3
	movew	d3,d2
	swap	d2
#ifndef __mcoldfire__
	subw	IMM (16),d7
#else
	subl	IMM (16),d7
#endif
	bne	2b		| if still more bits, go back to normal case
	bra	Laddsf$3
5:
#ifndef __mcoldfire__
	exg	d6,d7		| exchange the exponents
#else
	eorl	d6,d7
	eorl	d7,d6
	eorl	d6,d7
#endif
	subl	d6,d7		| keep the largest exponent
	negl	d7		|
#ifndef __mcoldfire__
	lsrw	IMM (8),d7	| put difference in lower byte
#else
	lsrl	IMM (8),d7	| put difference in lower byte
#endif
| if difference is too large we don't shift (and exit!) '
#ifndef __mcoldfire__
	cmpw	IMM (FLT_MANT_DIG+2),d7
#else
	cmpl	IMM (FLT_MANT_DIG+2),d7
#endif
	bge	Laddsf$a$small
#ifndef __mcoldfire__
	cmpw	IMM (16),d7	| if difference >= 16 swap
#else
	cmpl	IMM (16),d7	| if difference >= 16 swap
#endif
	bge	8f
6:
#ifndef __mcoldfire__
	subw	IMM (1),d7
#else
	subl	IMM (1),d7
#endif
7:
#ifndef __mcoldfire__
	lsrl	IMM (1),d0	| shift right first operand
	roxrl	IMM (1),d1
	dbra	d7,7b
#else
	lsrl	IMM (1),d1
	btst	IMM (0),d0
	beq	10f
	bset	IMM (31),d1
10:	lsrl	IMM (1),d0
	subql	IMM (1),d7
	bpl	7b
#endif
	bra	Laddsf$3
8:
| Same 16-bit shortcut as at label 4, this time for a in d0-d1.
	movew	d0,d1
	swap	d1
	movew	d1,d0
	swap	d0
#ifndef __mcoldfire__
	subw	IMM (16),d7
#else
	subl	IMM (16),d7
#endif
	bne	6b		| if still more bits, go back to normal case
				| otherwise we fall through

| Now we have a in d0-d1, b in d2-d3, and the largest exponent in d6 (the
| signs are stored in a0 and a1).

Laddsf$3:
| Here we have to decide whether to add or subtract the numbers
#ifndef __mcoldfire__
	exg	d6,a0		| get signs back
	exg	d7,a1		| and save the exponents
#else
	movel	d6,d4
	movel	a0,d6
	movel	d4,a0
	movel	d7,d4
	movel	a1,d7
	movel	d4,a1
#endif
	eorl	d6,d7		| combine sign bits
	bmi	Lsubsf$0	| if negative a and b have opposite
				| sign so we actually subtract the
				| numbers

| Here we have both positive or both negative
#ifndef __mcoldfire__
	exg	d6,a0		| now we have the exponent in d6
#else
	movel	d6,d4
	movel	a0,d6
	movel	d4,a0
#endif
	movel	a0,d7		| and sign in d7
	andl	IMM (0x80000000),d7
| Here we do the addition.
	addl	d3,d1
	addxl	d2,d0
| Note: now we have d2, d3, d4 and d5 to play with!

| Put the exponent, in the first byte, in d2, to use the "standard" rounding
| routines:
	movel	d6,d2
#ifndef __mcoldfire__
	lsrw	IMM (8),d2
#else
	lsrl	IMM (8),d2
#endif

| Before rounding normalize so bit #FLT_MANT_DIG is set (we will consider
| the case of denormalized numbers in the rounding routine itself).
| As in the addition (not in the subtraction!) we could have set
| one more bit we check this:
	btst	IMM (FLT_MANT_DIG+1),d0
	beq	1f
#ifndef __mcoldfire__
	lsrl	IMM (1),d0
	roxrl	IMM (1),d1
#else
	lsrl	IMM (1),d1
	btst	IMM (0),d0
	beq	10f
	bset	IMM (31),d1
10:	lsrl	IMM (1),d0
#endif
	addl	IMM (1),d2
1:
	lea	pc@(Laddsf$4),a0 | to return from rounding routine
	PICLEA	SYM (_fpCCR),a1	| check the rounding mode
#ifdef __mcoldfire__
	clrl	d6
#endif
	movew	a1@(6),d6	| rounding mode in d6
	beq	Lround$to$nearest
#ifndef __mcoldfire__
	cmpw	IMM (ROUND_TO_PLUS),d6
#else
	cmpl	IMM (ROUND_TO_PLUS),d6
#endif
	bhi	Lround$to$minus
	blt	Lround$to$zero
	bra	Lround$to$plus
Laddsf$4:
| Put back the exponent, but check for overflow.
| The exponent field 0xff is reserved for INFINITY and NaN, so a sum whose
| exponent has reached 0xff has already overflowed and must not be packed
| with its fraction. Hence the test is >= 0xff, as in the double precision
| exit path (which tests >= 0x7ff).
#ifndef __mcoldfire__
	cmpw	IMM (0xff),d2
#else
	cmpl	IMM (0xff),d2
#endif
	bge	1f
	bclr	IMM (FLT_MANT_DIG-1),d0
#ifndef __mcoldfire__
	lslw	IMM (7),d2
#else
	lsll	IMM (7),d2
#endif
	swap	d2
	orl	d2,d0
	bra	Laddsf$ret
1:
	moveq	IMM (ADD),d5
	bra	Lf$overflow

Lsubsf$0:
| We are here if a > 0 and b < 0 (sign bits cleared).
| Here we do the subtraction.
	movel	d6,d7		| put sign in d7
	andl	IMM (0x80000000),d7

	subl	d3,d1		| result in d0-d1
	subxl	d2,d0		|
	beq	Laddsf$ret	| if zero just exit
	bpl	1f		| if positive skip the following
	bchg	IMM (31),d7	| change sign bit in d7
	negl	d1
	negxl	d0
1:
#ifndef __mcoldfire__
	exg	d2,a0		| now we have the exponent in d2
	lsrw	IMM (8),d2	| put it in the first byte
#else
	movel	d2,d4
	movel	a0,d2
	movel	d4,a0
	lsrl	IMM (8),d2	| put it in the first byte
#endif

| Now d0-d1 is positive and the sign bit is in d7.

| Note that we do not have to normalize, since in the subtraction bit
| #FLT_MANT_DIG+1 is never set, and denormalized numbers are handled by
| the rounding routines themselves.
	lea	pc@(Lsubsf$1),a0 | to return from rounding routine
	PICLEA	SYM (_fpCCR),a1	| check the rounding mode
#ifdef __mcoldfire__
	clrl	d6
#endif
	movew	a1@(6),d6	| rounding mode in d6
	beq	Lround$to$nearest
#ifndef __mcoldfire__
	cmpw	IMM (ROUND_TO_PLUS),d6
#else
	cmpl	IMM (ROUND_TO_PLUS),d6
#endif
	bhi	Lround$to$minus
	blt	Lround$to$zero
	bra	Lround$to$plus
Lsubsf$1:
| Put back the exponent (we can't have overflow!). '
	bclr	IMM (FLT_MANT_DIG-1),d0
#ifndef __mcoldfire__
	lslw	IMM (7),d2
#else
	lsll	IMM (7),d2
#endif
	swap	d2
	orl	d2,d0
	bra	Laddsf$ret

| If one of the numbers was too small (difference of exponents >=
| FLT_MANT_DIG+2) we return the other (and now we don't have to '
| check for finiteness or zero).
Laddsf$a$small:
	movel	a6@(12),d0
	PICLEA	SYM (_fpCCR),a0
	movew	IMM (0),a0@
#ifndef __mcoldfire__
	moveml	sp@+,d2-d7	| restore data registers
#else
	moveml	sp@,d2-d7
	| XXX if frame pointer is ever removed, stack pointer must
	| be adjusted here.
#endif
	unlk	a6		| and return
	rts

Laddsf$b$small:
	movel	a6@(8),d0
	PICLEA	SYM (_fpCCR),a0
	movew	IMM (0),a0@
#ifndef __mcoldfire__
	moveml	sp@+,d2-d7	| restore data registers
#else
	moveml	sp@,d2-d7
	| XXX if frame pointer is ever removed, stack pointer must
	| be adjusted here.
#endif
	unlk	a6		| and return
	rts

| If the numbers are denormalized remember to put exponent equal to 1.

Laddsf$a$den:
	movel	d5,d6		| d5 contains 0x01000000
	swap	d6
	bra	Laddsf$1

Laddsf$b$den:
	movel	d5,d7
	swap	d7
	notl 	d4		| make d4 into a mask for the fraction
				| (this was not executed after the jump)
	bra	Laddsf$2

| The rest is mainly code for the different results which can be
| returned (checking always for +/-INFINITY and NaN).

Laddsf$b:
| Return b (if a is zero).
	movel	a6@(12),d0
	cmpl	IMM (0x80000000),d0	| Check if b is -0
	bne	1f
	movel	a0,d7
	andl	IMM (0x80000000),d7	| Use the sign of a
	clrl	d0
	bra	Laddsf$ret
Laddsf$a:
| Return a (if b is zero).
	movel	a6@(8),d0
1:
	moveq	IMM (ADD),d5
| We have to check for NaN and +/-infty.
	movel	d0,d7
	andl	IMM (0x80000000),d7	| put sign in d7
	bclr	IMM (31),d0		| clear sign
	cmpl	IMM (INFINITY),d0	| check for infty or NaN
	bge	2f
	movel	d0,d0		| check for zero (we do this because we don't '
	bne	Laddsf$ret	| want to return -0 by mistake
	bclr	IMM (31),d7	| if zero be sure to clear sign
	bra	Laddsf$ret	| if everything OK just return
2:
| The value to be returned is either +/-infty or NaN
	andl	IMM (0x007fffff),d0	| check for NaN
	bne	Lf$inop		| if mantissa not zero is NaN
	bra	Lf$infty

Laddsf$ret:
| Normal exit (a and b nonzero, result is not NaN nor +/-infty).
| We have to clear the exception flags (just the exception type).
	PICLEA	SYM (_fpCCR),a0
	movew	IMM (0),a0@
	orl	d7,d0		| put sign bit
#ifndef __mcoldfire__
	moveml	sp@+,d2-d7	| restore data registers
#else
	moveml	sp@,d2-d7
	| XXX if frame pointer is ever removed, stack pointer must
	| be adjusted here.
#endif
	unlk	a6		| and return
	rts

Laddsf$ret$den:
| Return a denormalized number (for addition we don't signal underflow) '
	lsrl	IMM (1),d0	| remember to shift right back once
	bra	Laddsf$ret	| and return

| Note: when adding two floats of the same sign if either one is
| NaN we return NaN without regard to whether the other is finite or
| not. When subtracting them (i.e., when adding two numbers of
| opposite signs) things are more complicated: if both are INFINITY
| we return NaN, if only one is INFINITY and the other is NaN we return
| NaN, but if it is finite we return INFINITY with the corresponding sign.

Laddsf$nf:
	moveq	IMM (ADD),d5
| This could be faster but it is not worth the effort, since it is not
| executed very often. We sacrifice speed for clarity here.
	movel	a6@(8),d0	| get the numbers back (remember that we
	movel	a6@(12),d1	| did some processing already)
	movel	IMM (INFINITY),d4 | useful constant (INFINITY)
	movel	d0,d2		| save sign bits
	movel	d1,d3
	bclr	IMM (31),d0	| clear sign bits
	bclr	IMM (31),d1
| We know that one of them is either NaN of +/-INFINITY
| Check for NaN (if either one is NaN return NaN)
	cmpl	d4,d0		| check first a (d0)
	bhi	Lf$inop
	cmpl	d4,d1		| check now b (d1)
	bhi	Lf$inop
| Now comes the check for +/-INFINITY. We know that both are (maybe not
| finite) numbers, but we have to check if both are infinite whether we
| are adding or subtracting them.
+ eorl d3,d2 | to check sign bits + bmi 1f + movel d0,d7 + andl IMM (0x80000000),d7 | get (common) sign bit + bra Lf$infty +1: +| We know one (or both) are infinite, so we test for equality between the +| two numbers (if they are equal they have to be infinite both, so we +| return NaN). + cmpl d1,d0 | are both infinite? + beq Lf$inop | if so return NaN + + movel d0,d7 + andl IMM (0x80000000),d7 | get a's sign bit ' + cmpl d4,d0 | test now for infinity + beq Lf$infty | if a is INFINITY return with this sign + bchg IMM (31),d7 | else we know b is INFINITY and has + bra Lf$infty | the opposite sign + +|============================================================================= +| __mulsf3 +|============================================================================= + +| float __mulsf3(float, float); + FUNC(__mulsf3) +SYM (__mulsf3): +#ifndef __mcoldfire__ + link a6,IMM (0) + moveml d2-d7,sp@- +#else + link a6,IMM (-24) + moveml d2-d7,sp@ +#endif + movel a6@(8),d0 | get a into d0 + movel a6@(12),d1 | and b into d1 + movel d0,d7 | d7 will hold the sign of the product + eorl d1,d7 | + andl IMM (0x80000000),d7 + movel IMM (INFINITY),d6 | useful constant (+INFINITY) + movel d6,d5 | another (mask for fraction) + notl d5 | + movel IMM (0x00800000),d4 | this is to put hidden bit back + bclr IMM (31),d0 | get rid of a's sign bit ' + movel d0,d2 | + beq Lmulsf$a$0 | branch if a is zero + bclr IMM (31),d1 | get rid of b's sign bit ' + movel d1,d3 | + beq Lmulsf$b$0 | branch if b is zero + cmpl d6,d0 | is a big? + bhi Lmulsf$inop | if a is NaN return NaN + beq Lmulsf$inf | if a is INFINITY we have to check b + cmpl d6,d1 | now compare b with INFINITY + bhi Lmulsf$inop | is b NaN? + beq Lmulsf$overflow | is b INFINITY? +| Here we have both numbers finite and nonzero (and with no sign bit). +| Now we get the exponents into d2 and d3. 
+ andl d6,d2 | and isolate exponent in d2 + beq Lmulsf$a$den | if exponent is zero we have a denormalized + andl d5,d0 | and isolate fraction + orl d4,d0 | and put hidden bit back + swap d2 | I like exponents in the first byte +#ifndef __mcoldfire__ + lsrw IMM (7),d2 | +#else + lsrl IMM (7),d2 | +#endif +Lmulsf$1: | number + andl d6,d3 | + beq Lmulsf$b$den | + andl d5,d1 | + orl d4,d1 | + swap d3 | +#ifndef __mcoldfire__ + lsrw IMM (7),d3 | +#else + lsrl IMM (7),d3 | +#endif +Lmulsf$2: | +#ifndef __mcoldfire__ + addw d3,d2 | add exponents + subw IMM (F_BIAS+1),d2 | and subtract bias (plus one) +#else + addl d3,d2 | add exponents + subl IMM (F_BIAS+1),d2 | and subtract bias (plus one) +#endif + +| We are now ready to do the multiplication. The situation is as follows: +| both a and b have bit FLT_MANT_DIG-1 set (even if they were +| denormalized to start with!), which means that in the product +| bit 2*(FLT_MANT_DIG-1) (that is, bit 2*FLT_MANT_DIG-2-32 of the +| high long) is set. + +| To do the multiplication let us move the number a little bit around ... + movel d1,d6 | second operand in d6 + movel d0,d5 | first operand in d4-d5 + movel IMM (0),d4 + movel d4,d1 | the sums will go in d0-d1 + movel d4,d0 + +| now bit FLT_MANT_DIG-1 becomes bit 31: + lsll IMM (31-FLT_MANT_DIG+1),d6 + +| Start the loop (we loop #FLT_MANT_DIG times): + moveq IMM (FLT_MANT_DIG-1),d3 +1: addl d1,d1 | shift sum + addxl d0,d0 + lsll IMM (1),d6 | get bit bn + bcc 2f | if not set skip sum + addl d5,d1 | add a + addxl d4,d0 +2: +#ifndef __mcoldfire__ + dbf d3,1b | loop back +#else + subql IMM (1),d3 + bpl 1b +#endif + +| Now we have the product in d0-d1, with bit (FLT_MANT_DIG - 1) + FLT_MANT_DIG +| (mod 32) of d0 set. The first thing to do now is to normalize it so bit +| FLT_MANT_DIG is set (to do the rounding). 
+#ifndef __mcoldfire__ + rorl IMM (6),d1 + swap d1 + movew d1,d3 + andw IMM (0x03ff),d3 + andw IMM (0xfd00),d1 +#else + movel d1,d3 + lsll IMM (8),d1 + addl d1,d1 + addl d1,d1 + moveq IMM (22),d5 + lsrl d5,d3 + orl d3,d1 + andl IMM (0xfffffd00),d1 +#endif + lsll IMM (8),d0 + addl d0,d0 + addl d0,d0 +#ifndef __mcoldfire__ + orw d3,d0 +#else + orl d3,d0 +#endif + + moveq IMM (MULTIPLY),d5 + + btst IMM (FLT_MANT_DIG+1),d0 + beq Lround$exit +#ifndef __mcoldfire__ + lsrl IMM (1),d0 + roxrl IMM (1),d1 + addw IMM (1),d2 +#else + lsrl IMM (1),d1 + btst IMM (0),d0 + beq 10f + bset IMM (31),d1 +10: lsrl IMM (1),d0 + addql IMM (1),d2 +#endif + bra Lround$exit + +Lmulsf$inop: + moveq IMM (MULTIPLY),d5 + bra Lf$inop + +Lmulsf$overflow: + moveq IMM (MULTIPLY),d5 + bra Lf$overflow + +Lmulsf$inf: + moveq IMM (MULTIPLY),d5 +| If either is NaN return NaN; else both are (maybe infinite) numbers, so +| return INFINITY with the correct sign (which is in d7). + cmpl d6,d1 | is b NaN? + bhi Lf$inop | if so return NaN + bra Lf$overflow | else return +/-INFINITY + +| If either number is zero return zero, unless the other is +/-INFINITY, +| or NaN, in which case we return NaN. +Lmulsf$b$0: +| Here d1 (==b) is zero. + movel a6@(8),d1 | get a again to check for non-finiteness + bra 1f +Lmulsf$a$0: + movel a6@(12),d1 | get b again to check for non-finiteness +1: bclr IMM (31),d1 | clear sign bit + cmpl IMM (INFINITY),d1 | and check for a large exponent + bge Lf$inop | if b is +/-INFINITY or NaN return NaN + movel d7,d0 | else return signed zero + PICLEA SYM (_fpCCR),a0 | + movew IMM (0),a0@ | +#ifndef __mcoldfire__ + moveml sp@+,d2-d7 | +#else + moveml sp@,d2-d7 + | XXX if frame pointer is ever removed, stack pointer must + | be adjusted here. +#endif + unlk a6 | + rts | + +| If a number is denormalized we put an exponent of 1 but do not put the +| hidden bit back into the fraction; instead we shift left until bit 23 +| (the hidden bit) is set, adjusting the exponent accordingly. 
We do this +| to ensure that the product of the fractions is close to 1. +Lmulsf$a$den: + movel IMM (1),d2 + andl d5,d0 +1: addl d0,d0 | shift a left (until bit 23 is set) +#ifndef __mcoldfire__ + subw IMM (1),d2 | and adjust exponent +#else + subql IMM (1),d2 | and adjust exponent +#endif + btst IMM (FLT_MANT_DIG-1),d0 + bne Lmulsf$1 | + bra 1b | else loop back + +Lmulsf$b$den: + movel IMM (1),d3 + andl d5,d1 +1: addl d1,d1 | shift b left until bit 23 is set +#ifndef __mcoldfire__ + subw IMM (1),d3 | and adjust exponent +#else + subql IMM (1),d3 | and adjust exponent +#endif + btst IMM (FLT_MANT_DIG-1),d1 + bne Lmulsf$2 | + bra 1b | else loop back + +|============================================================================= +| __divsf3 +|============================================================================= + +| float __divsf3(float, float); + FUNC(__divsf3) +SYM (__divsf3): +#ifndef __mcoldfire__ + link a6,IMM (0) + moveml d2-d7,sp@- +#else + link a6,IMM (-24) + moveml d2-d7,sp@ +#endif + movel a6@(8),d0 | get a into d0 + movel a6@(12),d1 | and b into d1 + movel d0,d7 | d7 will hold the sign of the result + eorl d1,d7 | + andl IMM (0x80000000),d7 | + movel IMM (INFINITY),d6 | useful constant (+INFINITY) + movel d6,d5 | another (mask for fraction) + notl d5 | + movel IMM (0x00800000),d4 | this is to put hidden bit back + bclr IMM (31),d0 | get rid of a's sign bit ' + movel d0,d2 | + beq Ldivsf$a$0 | branch if a is zero + bclr IMM (31),d1 | get rid of b's sign bit ' + movel d1,d3 | + beq Ldivsf$b$0 | branch if b is zero + cmpl d6,d0 | is a big? + bhi Ldivsf$inop | if a is NaN return NaN + beq Ldivsf$inf | if a is INFINITY we have to check b + cmpl d6,d1 | now compare b with INFINITY + bhi Ldivsf$inop | if b is NaN return NaN + beq Ldivsf$underflow +| Here we have both numbers finite and nonzero (and with no sign bit). +| Now we get the exponents into d2 and d3 and normalize the numbers to +| ensure that the ratio of the fractions is close to 1. 
We do this by +| making sure that bit #FLT_MANT_DIG-1 (hidden bit) is set. + andl d6,d2 | and isolate exponent in d2 + beq Ldivsf$a$den | if exponent is zero we have a denormalized + andl d5,d0 | and isolate fraction + orl d4,d0 | and put hidden bit back + swap d2 | I like exponents in the first byte +#ifndef __mcoldfire__ + lsrw IMM (7),d2 | +#else + lsrl IMM (7),d2 | +#endif +Ldivsf$1: | + andl d6,d3 | + beq Ldivsf$b$den | + andl d5,d1 | + orl d4,d1 | + swap d3 | +#ifndef __mcoldfire__ + lsrw IMM (7),d3 | +#else + lsrl IMM (7),d3 | +#endif +Ldivsf$2: | +#ifndef __mcoldfire__ + subw d3,d2 | subtract exponents + addw IMM (F_BIAS),d2 | and add bias +#else + subl d3,d2 | subtract exponents + addl IMM (F_BIAS),d2 | and add bias +#endif + +| We are now ready to do the division. We have prepared things in such a way +| that the ratio of the fractions will be less than 2 but greater than 1/2. +| At this point the registers in use are: +| d0 holds a (first operand, bit FLT_MANT_DIG=0, bit FLT_MANT_DIG-1=1) +| d1 holds b (second operand, bit FLT_MANT_DIG=1) +| d2 holds the difference of the exponents, corrected by the bias +| d7 holds the sign of the ratio +| d4, d5, d6 hold some constants + movel d7,a0 | d6-d7 will hold the ratio of the fractions + movel IMM (0),d6 | + movel d6,d7 + + moveq IMM (FLT_MANT_DIG+1),d3 +1: cmpl d0,d1 | is a < b? + bhi 2f | + bset d3,d6 | set a bit in d6 + subl d1,d0 | if a >= b a <-- a-b + beq 3f | if a is zero, exit +2: addl d0,d0 | multiply a by 2 +#ifndef __mcoldfire__ + dbra d3,1b +#else + subql IMM (1),d3 + bpl 1b +#endif + +| Now we keep going to set the sticky bit ... 
+ moveq IMM (FLT_MANT_DIG),d3 +1: cmpl d0,d1 + ble 2f + addl d0,d0 +#ifndef __mcoldfire__ + dbra d3,1b +#else + subql IMM(1),d3 + bpl 1b +#endif + movel IMM (0),d1 + bra 3f +2: movel IMM (0),d1 +#ifndef __mcoldfire__ + subw IMM (FLT_MANT_DIG),d3 + addw IMM (31),d3 +#else + subl IMM (FLT_MANT_DIG),d3 + addl IMM (31),d3 +#endif + bset d3,d1 +3: + movel d6,d0 | put the ratio in d0-d1 + movel a0,d7 | get sign back + +| Because of the normalization we did before we are guaranteed that +| d0 is smaller than 2^26 but larger than 2^24. Thus bit 26 is not set, +| bit 25 could be set, and if it is not set then bit 24 is necessarily set. + btst IMM (FLT_MANT_DIG+1),d0 + beq 1f | if it is not set, then bit 24 is set + lsrl IMM (1),d0 | +#ifndef __mcoldfire__ + addw IMM (1),d2 | +#else + addl IMM (1),d2 | +#endif +1: +| Now round, check for over- and underflow, and exit. + moveq IMM (DIVIDE),d5 + bra Lround$exit + +Ldivsf$inop: + moveq IMM (DIVIDE),d5 + bra Lf$inop + +Ldivsf$overflow: + moveq IMM (DIVIDE),d5 + bra Lf$overflow + +Ldivsf$underflow: + moveq IMM (DIVIDE),d5 + bra Lf$underflow + +Ldivsf$a$0: + moveq IMM (DIVIDE),d5 +| If a is zero check to see whether b is zero also. In that case return +| NaN; then check if b is NaN, and return NaN also in that case. Else +| return a properly signed zero. + andl IMM (0x7fffffff),d1 | clear sign bit and test b + beq Lf$inop | if b is also zero return NaN + cmpl IMM (INFINITY),d1 | check for NaN + bhi Lf$inop | + movel d7,d0 | else return signed zero + PICLEA SYM (_fpCCR),a0 | + movew IMM (0),a0@ | +#ifndef __mcoldfire__ + moveml sp@+,d2-d7 | +#else + moveml sp@,d2-d7 | + | XXX if frame pointer is ever removed, stack pointer must + | be adjusted here. +#endif + unlk a6 | + rts | + +Ldivsf$b$0: + moveq IMM (DIVIDE),d5 +| If we got here a is not zero. Check if a is NaN; in that case return NaN, +| else return +/-INFINITY. Remember that a is in d0 with the sign bit +| cleared already. 
+ cmpl IMM (INFINITY),d0 | compare d0 with INFINITY + bhi Lf$inop | if larger it is NaN + bra Lf$div$0 | else signal DIVIDE_BY_ZERO + +Ldivsf$inf: + moveq IMM (DIVIDE),d5 +| If a is INFINITY we have to check b + cmpl IMM (INFINITY),d1 | compare b with INFINITY + bge Lf$inop | if b is NaN or INFINITY return NaN + bra Lf$overflow | else return overflow + +| If a number is denormalized we put an exponent of 1 but do not put the +| hidden bit back into the fraction. +Ldivsf$a$den: + movel IMM (1),d2 + andl d5,d0 +1: addl d0,d0 | shift a left until bit FLT_MANT_DIG-1 is set +#ifndef __mcoldfire__ + subw IMM (1),d2 | and adjust exponent +#else + subl IMM (1),d2 | and adjust exponent +#endif + btst IMM (FLT_MANT_DIG-1),d0 + bne Ldivsf$1 + bra 1b + +Ldivsf$b$den: + movel IMM (1),d3 + andl d5,d1 +1: addl d1,d1 | shift b left until bit FLT_MANT_DIG-1 is set +#ifndef __mcoldfire__ + subw IMM (1),d3 | and adjust exponent +#else + subl IMM (1),d3 | and adjust exponent +#endif + btst IMM (FLT_MANT_DIG-1),d1 + bne Ldivsf$2 + bra 1b + +Lround$exit: +| This is a common exit point for __mulsf3 and __divsf3. + +| First check for underflow in the exponent: +#ifndef __mcoldfire__ + cmpw IMM (-FLT_MANT_DIG-1),d2 +#else + cmpl IMM (-FLT_MANT_DIG-1),d2 +#endif + blt Lf$underflow +| It could happen that the exponent is less than 1, in which case the +| number is denormalized. In this case we shift right and adjust the +| exponent until it becomes 1 or the fraction is zero (in the latter case +| we signal underflow and return zero). + movel IMM (0),d6 | d6 is used temporarily +#ifndef __mcoldfire__ + cmpw IMM (1),d2 | if the exponent is less than 1 we +#else + cmpl IMM (1),d2 | if the exponent is less than 1 we +#endif + bge 2f | have to shift right (denormalize) +1: +#ifndef __mcoldfire__ + addw IMM (1),d2 | adjust the exponent + lsrl IMM (1),d0 | shift right once + roxrl IMM (1),d1 | + roxrl IMM (1),d6 | d6 collects bits we would lose otherwise + cmpw IMM (1),d2 | is the exponent 1 already? 
+#else + addql IMM (1),d2 | adjust the exponent + lsrl IMM (1),d6 + btst IMM (0),d1 + beq 11f + bset IMM (31),d6 +11: lsrl IMM (1),d1 + btst IMM (0),d0 + beq 10f + bset IMM (31),d1 +10: lsrl IMM (1),d0 + cmpl IMM (1),d2 | is the exponent 1 already? +#endif + beq 2f | if not loop back + bra 1b | + bra Lf$underflow | safety check, shouldn't execute ' +2: orl d6,d1 | this is a trick so we don't lose ' + | the extra bits which were flushed right +| Now call the rounding routine (which takes care of denormalized numbers): + lea pc@(Lround$0),a0 | to return from rounding routine + PICLEA SYM (_fpCCR),a1 | check the rounding mode +#ifdef __mcoldfire__ + clrl d6 +#endif + movew a1@(6),d6 | rounding mode in d6 + beq Lround$to$nearest +#ifndef __mcoldfire__ + cmpw IMM (ROUND_TO_PLUS),d6 +#else + cmpl IMM (ROUND_TO_PLUS),d6 +#endif + bhi Lround$to$minus + blt Lround$to$zero + bra Lround$to$plus +Lround$0: +| Here we have a correctly rounded result (either normalized or denormalized). + +| Here we should have either a normalized number or a denormalized one, and +| the exponent is necessarily larger or equal to 1 (so we don't have to ' +| check again for underflow!). We have to check for overflow or for a +| denormalized number (which also signals underflow). +| Check for overflow (i.e., exponent >= 255). +#ifndef __mcoldfire__ + cmpw IMM (0x00ff),d2 +#else + cmpl IMM (0x00ff),d2 +#endif + bge Lf$overflow +| Now check for a denormalized number (exponent==0). + movew d2,d2 + beq Lf$den +1: +| Put back the exponents and sign and return. 
+#ifndef __mcoldfire__ + lslw IMM (7),d2 | exponent back to fourth byte +#else + lsll IMM (7),d2 | exponent back to fourth byte +#endif + bclr IMM (FLT_MANT_DIG-1),d0 + swap d0 | and put back exponent +#ifndef __mcoldfire__ + orw d2,d0 | +#else + orl d2,d0 +#endif + swap d0 | + orl d7,d0 | and sign also + + PICLEA SYM (_fpCCR),a0 + movew IMM (0),a0@ +#ifndef __mcoldfire__ + moveml sp@+,d2-d7 +#else + moveml sp@,d2-d7 + | XXX if frame pointer is ever removed, stack pointer must + | be adjusted here. +#endif + unlk a6 + rts + +|============================================================================= +| __negsf2 +|============================================================================= + +| This is trivial and could be shorter if we didn't bother checking for NaN ' +| and +/-INFINITY. + +| float __negsf2(float); + FUNC(__negsf2) +SYM (__negsf2): +#ifndef __mcoldfire__ + link a6,IMM (0) + moveml d2-d7,sp@- +#else + link a6,IMM (-24) + moveml d2-d7,sp@ +#endif + moveq IMM (NEGATE),d5 + movel a6@(8),d0 | get number to negate in d0 + bchg IMM (31),d0 | negate + movel d0,d1 | make a positive copy + bclr IMM (31),d1 | + tstl d1 | check for zero + beq 2f | if zero (either sign) return +zero + cmpl IMM (INFINITY),d1 | compare to +INFINITY + blt 1f | + bhi Lf$inop | if larger (fraction not zero) is NaN + movel d0,d7 | else get sign and return INFINITY + andl IMM (0x80000000),d7 + bra Lf$infty +1: PICLEA SYM (_fpCCR),a0 + movew IMM (0),a0@ +#ifndef __mcoldfire__ + moveml sp@+,d2-d7 +#else + moveml sp@,d2-d7 + | XXX if frame pointer is ever removed, stack pointer must + | be adjusted here. 
+#endif + unlk a6 + rts +2: bclr IMM (31),d0 + bra 1b + +|============================================================================= +| __cmpsf2 +|============================================================================= + +GREATER = 1 +LESS = -1 +EQUAL = 0 + +| int __cmpsf2_internal(float, float, int); +SYM (__cmpsf2_internal): +#ifndef __mcoldfire__ + link a6,IMM (0) + moveml d2-d7,sp@- | save registers +#else + link a6,IMM (-24) + moveml d2-d7,sp@ +#endif + moveq IMM (COMPARE),d5 + movel a6@(8),d0 | get first operand + movel a6@(12),d1 | get second operand +| Check if either is NaN, and in that case return garbage and signal +| INVALID_OPERATION. Check also if either is zero, and clear the signs +| if necessary. + movel d0,d6 + andl IMM (0x7fffffff),d0 + beq Lcmpsf$a$0 + cmpl IMM (0x7f800000),d0 + bhi Lcmpf$inop +Lcmpsf$1: + movel d1,d7 + andl IMM (0x7fffffff),d1 + beq Lcmpsf$b$0 + cmpl IMM (0x7f800000),d1 + bhi Lcmpf$inop +Lcmpsf$2: +| Check the signs + eorl d6,d7 + bpl 1f +| If the signs are not equal check if a >= 0 + tstl d6 + bpl Lcmpsf$a$gt$b | if (a >= 0 && b < 0) => a > b + bmi Lcmpsf$b$gt$a | if (a < 0 && b >= 0) => a < b +1: +| If the signs are equal check for < 0 + tstl d6 + bpl 1f +| If both are negative exchange them +#ifndef __mcoldfire__ + exg d0,d1 +#else + movel d0,d7 + movel d1,d0 + movel d7,d1 +#endif +1: +| Now that they are positive we just compare them as longs (does this also +| work for denormalized numbers?). + cmpl d0,d1 + bhi Lcmpsf$b$gt$a | |b| > |a| + bne Lcmpsf$a$gt$b | |b| < |a| +| If we got here a == b. + movel IMM (EQUAL),d0 +#ifndef __mcoldfire__ + moveml sp@+,d2-d7 | put back the registers +#else + moveml sp@,d2-d7 +#endif + unlk a6 + rts +Lcmpsf$a$gt$b: + movel IMM (GREATER),d0 +#ifndef __mcoldfire__ + moveml sp@+,d2-d7 | put back the registers +#else + moveml sp@,d2-d7 + | XXX if frame pointer is ever removed, stack pointer must + | be adjusted here. 
+#endif + unlk a6 + rts +Lcmpsf$b$gt$a: + movel IMM (LESS),d0 +#ifndef __mcoldfire__ + moveml sp@+,d2-d7 | put back the registers +#else + moveml sp@,d2-d7 + | XXX if frame pointer is ever removed, stack pointer must + | be adjusted here. +#endif + unlk a6 + rts + +Lcmpsf$a$0: + bclr IMM (31),d6 + bra Lcmpsf$1 +Lcmpsf$b$0: + bclr IMM (31),d7 + bra Lcmpsf$2 + +Lcmpf$inop: + movl a6@(16),d0 + moveq IMM (INEXACT_RESULT+INVALID_OPERATION),d7 + moveq IMM (SINGLE_FLOAT),d6 + PICJUMP $_exception_handler + +| int __cmpsf2(float, float); + FUNC(__cmpsf2) +SYM (__cmpsf2): + link a6,IMM (0) + pea 1 + movl a6@(12),sp@- + movl a6@(8),sp@- + PICCALL SYM (__cmpsf2_internal) + unlk a6 + rts + +|============================================================================= +| rounding routines +|============================================================================= + +| The rounding routines expect the number to be normalized in registers +| d0-d1, with the exponent in register d2. They assume that the +| exponent is larger or equal to 1. They return a properly normalized number +| if possible, and a denormalized number otherwise. The exponent is returned +| in d2. + +Lround$to$nearest: +| We now normalize as suggested by D. Knuth ("Seminumerical Algorithms"): +| Here we assume that the exponent is not too small (this should be checked +| before entering the rounding routine), but the number could be denormalized. + +| Check for denormalized numbers: +1: btst IMM (FLT_MANT_DIG),d0 + bne 2f | if set the number is normalized +| Normalize shifting left until bit #FLT_MANT_DIG is set or the exponent +| is one (remember that a denormalized number corresponds to an +| exponent of -F_BIAS+1). 
+#ifndef __mcoldfire__ + cmpw IMM (1),d2 | remember that the exponent is at least one +#else + cmpl IMM (1),d2 | remember that the exponent is at least one +#endif + beq 2f | an exponent of one means denormalized + addl d1,d1 | else shift and adjust the exponent + addxl d0,d0 | +#ifndef __mcoldfire__ + dbra d2,1b | +#else + subql IMM (1),d2 + bpl 1b +#endif +2: +| Now round: we do it as follows: after the shifting we can write the +| fraction part as f + delta, where 1 < f < 2^25, and 0 <= delta <= 2. +| If delta < 1, do nothing. If delta > 1, add 1 to f. +| If delta == 1, we make sure the rounded number will be even (odd?) +| (after shifting). + btst IMM (0),d0 | is delta < 1? + beq 2f | if so, do not do anything + tstl d1 | is delta == 1? + bne 1f | if so round to even + movel d0,d1 | + andl IMM (2),d1 | bit 1 is the last significant bit + addl d1,d0 | + bra 2f | +1: movel IMM (1),d1 | else add 1 + addl d1,d0 | +| Shift right once (because we used bit #FLT_MANT_DIG!). +2: lsrl IMM (1),d0 +| Now check again bit #FLT_MANT_DIG (rounding could have produced a +| 'fraction overflow' ...). + btst IMM (FLT_MANT_DIG),d0 + beq 1f + lsrl IMM (1),d0 +#ifndef __mcoldfire__ + addw IMM (1),d2 +#else + addql IMM (1),d2 +#endif +1: +| If bit #FLT_MANT_DIG-1 is clear we have a denormalized number, so we +| have to put the exponent to zero and return a denormalized number. + btst IMM (FLT_MANT_DIG-1),d0 + beq 1f + jmp a0@ +1: movel IMM (0),d2 + jmp a0@ + +Lround$to$zero: +Lround$to$plus: +Lround$to$minus: + jmp a0@ +#endif /* L_float */ + +| gcc expects the routines __eqdf2, __nedf2, __gtdf2, __gedf2, +| __ledf2, __ltdf2 to all return the same value as a direct call to +| __cmpdf2 would. In this implementation, each of these routines +| simply calls __cmpdf2. It would be more efficient to give the +| __cmpdf2 routine several names, but separating them out will make it +| easier to write efficient versions of these routines someday. 
+| If the operands compare unordered, __gtdf2 and __gedf2 return -1. +| The other routines return 1. + +#ifdef L_eqdf2 + .text + FUNC(__eqdf2) + .globl SYM (__eqdf2) +SYM (__eqdf2): + link a6,IMM (0) + pea 1 + movl a6@(20),sp@- + movl a6@(16),sp@- + movl a6@(12),sp@- + movl a6@(8),sp@- + PICCALL SYM (__cmpdf2_internal) + unlk a6 + rts +#endif /* L_eqdf2 */ + +#ifdef L_nedf2 + .text + FUNC(__nedf2) + .globl SYM (__nedf2) +SYM (__nedf2): + link a6,IMM (0) + pea 1 + movl a6@(20),sp@- + movl a6@(16),sp@- + movl a6@(12),sp@- + movl a6@(8),sp@- + PICCALL SYM (__cmpdf2_internal) + unlk a6 + rts +#endif /* L_nedf2 */ + +#ifdef L_gtdf2 + .text + FUNC(__gtdf2) + .globl SYM (__gtdf2) +SYM (__gtdf2): + link a6,IMM (0) + pea -1 + movl a6@(20),sp@- + movl a6@(16),sp@- + movl a6@(12),sp@- + movl a6@(8),sp@- + PICCALL SYM (__cmpdf2_internal) + unlk a6 + rts +#endif /* L_gtdf2 */ + +#ifdef L_gedf2 + .text + FUNC(__gedf2) + .globl SYM (__gedf2) +SYM (__gedf2): + link a6,IMM (0) + pea -1 + movl a6@(20),sp@- + movl a6@(16),sp@- + movl a6@(12),sp@- + movl a6@(8),sp@- + PICCALL SYM (__cmpdf2_internal) + unlk a6 + rts +#endif /* L_gedf2 */ + +#ifdef L_ltdf2 + .text + FUNC(__ltdf2) + .globl SYM (__ltdf2) +SYM (__ltdf2): + link a6,IMM (0) + pea 1 + movl a6@(20),sp@- + movl a6@(16),sp@- + movl a6@(12),sp@- + movl a6@(8),sp@- + PICCALL SYM (__cmpdf2_internal) + unlk a6 + rts +#endif /* L_ltdf2 */ + +#ifdef L_ledf2 + .text + FUNC(__ledf2) + .globl SYM (__ledf2) +SYM (__ledf2): + link a6,IMM (0) + pea 1 + movl a6@(20),sp@- + movl a6@(16),sp@- + movl a6@(12),sp@- + movl a6@(8),sp@- + PICCALL SYM (__cmpdf2_internal) + unlk a6 + rts +#endif /* L_ledf2 */ + +| The comments above about __eqdf2, et al., also apply to __eqsf2, +| et al., except that the latter call __cmpsf2 rather than __cmpdf2. 
+ +#ifdef L_eqsf2 + .text + FUNC(__eqsf2) + .globl SYM (__eqsf2) +SYM (__eqsf2): + link a6,IMM (0) + pea 1 + movl a6@(12),sp@- + movl a6@(8),sp@- + PICCALL SYM (__cmpsf2_internal) + unlk a6 + rts +#endif /* L_eqsf2 */ + +#ifdef L_nesf2 + .text + FUNC(__nesf2) + .globl SYM (__nesf2) +SYM (__nesf2): + link a6,IMM (0) + pea 1 + movl a6@(12),sp@- + movl a6@(8),sp@- + PICCALL SYM (__cmpsf2_internal) + unlk a6 + rts +#endif /* L_nesf2 */ + +#ifdef L_gtsf2 + .text + FUNC(__gtsf2) + .globl SYM (__gtsf2) +SYM (__gtsf2): + link a6,IMM (0) + pea -1 + movl a6@(12),sp@- + movl a6@(8),sp@- + PICCALL SYM (__cmpsf2_internal) + unlk a6 + rts +#endif /* L_gtsf2 */ + +#ifdef L_gesf2 + .text + FUNC(__gesf2) + .globl SYM (__gesf2) +SYM (__gesf2): + link a6,IMM (0) + pea -1 + movl a6@(12),sp@- + movl a6@(8),sp@- + PICCALL SYM (__cmpsf2_internal) + unlk a6 + rts +#endif /* L_gesf2 */ + +#ifdef L_ltsf2 + .text + FUNC(__ltsf2) + .globl SYM (__ltsf2) +SYM (__ltsf2): + link a6,IMM (0) + pea 1 + movl a6@(12),sp@- + movl a6@(8),sp@- + PICCALL SYM (__cmpsf2_internal) + unlk a6 + rts +#endif /* L_ltsf2 */ + +#ifdef L_lesf2 + .text + FUNC(__lesf2) + .globl SYM (__lesf2) +SYM (__lesf2): + link a6,IMM (0) + pea 1 + movl a6@(12),sp@- + movl a6@(8),sp@- + PICCALL SYM (__cmpsf2_internal) + unlk a6 + rts +#endif /* L_lesf2 */ + +#if defined (__ELF__) && defined (__linux__) + /* Make stack non-executable for ELF linux targets. 
*/ + .section .note.GNU-stack,"",@progbits +#endif diff --git a/libgcc/config/m68k/t-floatlib b/libgcc/config/m68k/t-floatlib new file mode 100644 index 00000000000..4160eb9f537 --- /dev/null +++ b/libgcc/config/m68k/t-floatlib @@ -0,0 +1,5 @@ +LIB1ASMSRC = m68k/lb1sf68.S +LIB1ASMFUNCS = _mulsi3 _udivsi3 _divsi3 _umodsi3 _modsi3 \ + _double _float _floatex \ + _eqdf2 _nedf2 _gtdf2 _gedf2 _ltdf2 _ledf2 \ + _eqsf2 _nesf2 _gtsf2 _gesf2 _ltsf2 _lesf2 diff --git a/libgcc/config/mcore/lib1funcs.S b/libgcc/config/mcore/lib1funcs.S new file mode 100644 index 00000000000..701762f2a3c --- /dev/null +++ b/libgcc/config/mcore/lib1funcs.S @@ -0,0 +1,303 @@ +/* libgcc routines for the MCore. + Copyright (C) 1993, 1999, 2000, 2009 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3, or (at your option) any +later version. + +This file is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + +#define CONCAT1(a, b) CONCAT2(a, b) +#define CONCAT2(a, b) a ## b + +/* Use the right prefix for global labels. */ + +#define SYM(x) CONCAT1 (__, x) + +#ifdef __ELF__ +#define TYPE(x) .type SYM (x),@function +#define SIZE(x) .size SYM (x), . 
- SYM (x) +#else +#define TYPE(x) +#define SIZE(x) +#endif + +.macro FUNC_START name + .text + .globl SYM (\name) + TYPE (\name) +SYM (\name): +.endm + +.macro FUNC_END name + SIZE (\name) +.endm + +#ifdef L_udivsi3 +FUNC_START udiv32 +FUNC_START udivsi32 + + movi r1,0 // r1-r2 form 64 bit dividend + movi r4,1 // r4 is quotient (1 for a sentinel) + + cmpnei r3,0 // look for 0 divisor + bt 9f + trap 3 // divide by 0 +9: + // control iterations; skip across high order 0 bits in dividend + mov r7,r2 + cmpnei r7,0 + bt 8f + movi r2,0 // 0 dividend + jmp r15 // quick return +8: + ff1 r7 // figure distance to skip + lsl r4,r7 // move the sentinel along (with 0's behind) + lsl r2,r7 // and the low 32 bits of numerator + +// appears to be wrong... +// tested out incorrectly in our OS work... +// mov r7,r3 // looking at divisor +// ff1 r7 // I can move 32-r7 more bits to left. +// addi r7,1 // ok, one short of that... +// mov r1,r2 +// lsr r1,r7 // bits that came from low order... +// rsubi r7,31 // r7 == "32-n" == LEFT distance +// addi r7,1 // this is (32-n) +// lsl r4,r7 // fixes the high 32 (quotient) +// lsl r2,r7 +// cmpnei r4,0 +// bf 4f // the sentinel went away... + + // run the remaining bits + +1: lslc r2,1 // 1 bit left shift of r1-r2 + addc r1,r1 + cmphs r1,r3 // upper 32 of dividend >= divisor? 
+ bf 2f + sub r1,r3 // if yes, subtract divisor +2: addc r4,r4 // shift by 1 and count subtracts + bf 1b // if sentinel falls out of quotient, stop + +4: mov r2,r4 // return quotient + mov r3,r1 // and piggyback the remainder + jmp r15 +FUNC_END udiv32 +FUNC_END udivsi32 +#endif + +#ifdef L_umodsi3 +FUNC_START urem32 +FUNC_START umodsi3 + movi r1,0 // r1-r2 form 64 bit dividend + movi r4,1 // r4 is quotient (1 for a sentinel) + cmpnei r3,0 // look for 0 divisor + bt 9f + trap 3 // divide by 0 +9: + // control iterations; skip across high order 0 bits in dividend + mov r7,r2 + cmpnei r7,0 + bt 8f + movi r2,0 // 0 dividend + jmp r15 // quick return +8: + ff1 r7 // figure distance to skip + lsl r4,r7 // move the sentinel along (with 0's behind) + lsl r2,r7 // and the low 32 bits of numerator + +1: lslc r2,1 // 1 bit left shift of r1-r2 + addc r1,r1 + cmphs r1,r3 // upper 32 of dividend >= divisor? + bf 2f + sub r1,r3 // if yes, subtract divisor +2: addc r4,r4 // shift by 1 and count subtracts + bf 1b // if sentinel falls out of quotient, stop + mov r2,r1 // return remainder + jmp r15 +FUNC_END urem32 +FUNC_END umodsi3 +#endif + +#ifdef L_divsi3 +FUNC_START div32 +FUNC_START divsi3 + mov r5,r2 // calc sign of quotient + xor r5,r3 + abs r2 // do unsigned divide + abs r3 + movi r1,0 // r1-r2 form 64 bit dividend + movi r4,1 // r4 is quotient (1 for a sentinel) + cmpnei r3,0 // look for 0 divisor + bt 9f + trap 3 // divide by 0 +9: + // control iterations; skip across high order 0 bits in dividend + mov r7,r2 + cmpnei r7,0 + bt 8f + movi r2,0 // 0 dividend + jmp r15 // quick return +8: + ff1 r7 // figure distance to skip + lsl r4,r7 // move the sentinel along (with 0's behind) + lsl r2,r7 // and the low 32 bits of numerator + +// tested out incorrectly in our OS work... +// mov r7,r3 // looking at divisor +// ff1 r7 // I can move 32-r7 more bits to left. +// addi r7,1 // ok, one short of that... +// mov r1,r2 +// lsr r1,r7 // bits that came from low order... 
+// rsubi r7,31 // r7 == "32-n" == LEFT distance +// addi r7,1 // this is (32-n) +// lsl r4,r7 // fixes the high 32 (quotient) +// lsl r2,r7 +// cmpnei r4,0 +// bf 4f // the sentinel went away... + + // run the remaining bits +1: lslc r2,1 // 1 bit left shift of r1-r2 + addc r1,r1 + cmphs r1,r3 // upper 32 of dividend >= divisor? + bf 2f + sub r1,r3 // if yes, subtract divisor +2: addc r4,r4 // shift by 1 and count subtracts + bf 1b // if sentinel falls out of quotient, stop + +4: mov r2,r4 // return quotient + mov r3,r1 // piggyback the remainder + btsti r5,31 // after adjusting for sign + bf 3f + rsubi r2,0 + rsubi r3,0 +3: jmp r15 +FUNC_END div32 +FUNC_END divsi3 +#endif + +#ifdef L_modsi3 +FUNC_START rem32 +FUNC_START modsi3 + mov r5,r2 // calc sign of remainder + abs r2 // do unsigned divide + abs r3 + movi r1,0 // r1-r2 form 64 bit dividend + movi r4,1 // r4 is quotient (1 for a sentinel) + cmpnei r3,0 // look for 0 divisor + bt 9f + trap 3 // divide by 0 +9: + // control iterations; skip across high order 0 bits in dividend + mov r7,r2 + cmpnei r7,0 + bt 8f + movi r2,0 // 0 dividend + jmp r15 // quick return +8: + ff1 r7 // figure distance to skip + lsl r4,r7 // move the sentinel along (with 0's behind) + lsl r2,r7 // and the low 32 bits of numerator + +1: lslc r2,1 // 1 bit left shift of r1-r2 + addc r1,r1 + cmphs r1,r3 // upper 32 of dividend >= divisor? + bf 2f + sub r1,r3 // if yes, subtract divisor +2: addc r4,r4 // shift by 1 and count subtracts + bf 1b // if sentinel falls out of quotient, stop + mov r2,r1 // return remainder + btsti r5,31 // after adjusting for sign + bf 3f + rsubi r2,0 +3: jmp r15 +FUNC_END rem32 +FUNC_END modsi3 +#endif + + +/* GCC expects that {__eq,__ne,__gt,__ge,__le,__lt}{df2,sf2} + will behave as __cmpdf2. So, we stub the implementations to + jump on to __cmpdf2 and __cmpsf2. + + All of these shortcircuit the return path so that __cmp{sd}f2 + will go directly back to the caller. 
*/ + +.macro COMPARE_DF_JUMP name + .import SYM (cmpdf2) +FUNC_START \name + jmpi SYM (cmpdf2) +FUNC_END \name +.endm + +#ifdef L_eqdf2 +COMPARE_DF_JUMP eqdf2 +#endif /* L_eqdf2 */ + +#ifdef L_nedf2 +COMPARE_DF_JUMP nedf2 +#endif /* L_nedf2 */ + +#ifdef L_gtdf2 +COMPARE_DF_JUMP gtdf2 +#endif /* L_gtdf2 */ + +#ifdef L_gedf2 +COMPARE_DF_JUMP gedf2 +#endif /* L_gedf2 */ + +#ifdef L_ltdf2 +COMPARE_DF_JUMP ltdf2 +#endif /* L_ltdf2 */ + +#ifdef L_ledf2 +COMPARE_DF_JUMP ledf2 +#endif /* L_ledf2 */ + +/* SINGLE PRECISION FLOATING POINT STUBS. Pass the bare routine name to the macro: FUNC_START applies SYM (), which supplies the "__" prefix. */ + +.macro COMPARE_SF_JUMP name + .import SYM (cmpsf2) +FUNC_START \name + jmpi SYM (cmpsf2) +FUNC_END \name +.endm + +#ifdef L_eqsf2 +COMPARE_SF_JUMP eqsf2 +#endif /* L_eqsf2 */ + +#ifdef L_nesf2 +COMPARE_SF_JUMP nesf2 +#endif /* L_nesf2 */ + +#ifdef L_gtsf2 +COMPARE_SF_JUMP gtsf2 +#endif /* L_gtsf2 */ + +#ifdef L_gesf2 +COMPARE_SF_JUMP gesf2 +#endif /* L_gesf2 */ + +#ifdef L_ltsf2 +COMPARE_SF_JUMP ltsf2 +#endif /* L_ltsf2 */ + +#ifdef L_lesf2 +COMPARE_SF_JUMP lesf2 +#endif /* L_lesf2 */ diff --git a/libgcc/config/mcore/t-mcore b/libgcc/config/mcore/t-mcore new file mode 100644 index 00000000000..19c4c15cd0b --- /dev/null +++ b/libgcc/config/mcore/t-mcore @@ -0,0 +1,2 @@ +LIB1ASMSRC = mcore/lib1funcs.S +LIB1ASMFUNCS = _divsi3 _udivsi3 _modsi3 _umodsi3 diff --git a/libgcc/config/mep/lib1funcs.S b/libgcc/config/mep/lib1funcs.S new file mode 100644 index 00000000000..0a18913f927 --- /dev/null +++ b/libgcc/config/mep/lib1funcs.S @@ -0,0 +1,125 @@ +/* libgcc routines for Toshiba Media Processor. + Copyright (C) 2001, 2002, 2005, 2009 Free Software Foundation, Inc. + +This file is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3 of the License, or (at your +option) any later version. 
+ +This file is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + +#define SAVEALL \ + add3 $sp, $sp, -16*4 ; \ + sw $0, ($sp) ; \ + sw $1, 4($sp) ; \ + sw $2, 8($sp) ; \ + sw $3, 12($sp) ; \ + sw $4, 16($sp) ; \ + sw $5, 20($sp) ; \ + sw $6, 24($sp) ; \ + sw $7, 28($sp) ; \ + sw $8, 32($sp) ; \ + sw $9, 36($sp) ; \ + sw $10, 40($sp) ; \ + sw $11, 44($sp) ; \ + sw $12, 48($sp) ; \ + sw $13, 52($sp) ; \ + sw $14, 56($sp) ; \ + ldc $5, $lp ; \ + add $5, 3 ; \ + mov $6, -4 ; \ + and $5, $6 + +#define RESTOREALL \ + stc $5, $lp ; \ + lw $14, 56($sp) ; \ + lw $13, 52($sp) ; \ + lw $12, 48($sp) ; \ + lw $11, 44($sp) ; \ + lw $10, 40($sp) ; \ + lw $9, 36($sp) ; \ + lw $8, 32($sp) ; \ + lw $7, 28($sp) ; \ + lw $6, 24($sp) ; \ + lw $5, 20($sp) ; \ + lw $4, 16($sp) ; \ + lw $3, 12($sp) ; \ + lw $2, 8($sp) ; \ + lw $1, 4($sp) ; \ + lw $0, ($sp) ; \ + add3 $sp, $sp, 16*4 ; \ + ret + +#ifdef L_mep_profile + .text + .global __mep_mcount +__mep_mcount: + SAVEALL + ldc $1, $lp + mov $2, $0 + bsr __mep_mcount_2 + RESTOREALL +#endif + +#ifdef L_mep_bb_init_trace + .text + .global __mep_bb_init_trace_func +__mep_bb_init_trace_func: + SAVEALL + lw $1, ($5) + lw $2, 4($5) + add $5, 8 + bsr __bb_init_trace_func + RESTOREALL +#endif + +#ifdef L_mep_bb_init + .text + .global __mep_bb_init_func +__mep_bb_init_func: + SAVEALL + lw $1, ($5) + add $5, 4 + bsr __bb_init_func + RESTOREALL +#endif + 
+#ifdef L_mep_bb_trace + .text + .global __mep_bb_trace_func +__mep_bb_trace_func: + SAVEALL + movu $3, __bb + lw $1, ($5) + sw $1, ($3) + lw $2, 4($5) + sw $2, 4($3) + add $5, 8 + bsr __bb_trace_func + RESTOREALL +#endif + +#ifdef L_mep_bb_increment + .text + .global __mep_bb_increment_func +__mep_bb_increment_func: + SAVEALL + lw $1, ($5) + lw $0, ($1) + add $0, 1 + sw $0, ($1) + add $5, 4 + RESTOREALL +#endif diff --git a/libgcc/config/mep/t-mep b/libgcc/config/mep/t-mep index 36e6f5dc771..d1fb094a41e 100644 --- a/libgcc/config/mep/t-mep +++ b/libgcc/config/mep/t-mep @@ -1,2 +1,11 @@ +# profiling support +LIB1ASMSRC = mep/lib1funcs.S + +LIB1ASMFUNCS = _mep_profile \ + _mep_bb_init_trace \ + _mep_bb_init \ + _mep_bb_trace \ + _mep_bb_increment + # Use -O0 instead of -O2 so we don't get complex relocations CRTSTUFF_CFLAGS += -O0 diff --git a/libgcc/config/mips/mips16.S b/libgcc/config/mips/mips16.S new file mode 100644 index 00000000000..ec331b5f65e --- /dev/null +++ b/libgcc/config/mips/mips16.S @@ -0,0 +1,712 @@ +/* mips16 floating point support code + Copyright (C) 1996, 1997, 1998, 2008, 2009, 2010 + Free Software Foundation, Inc. + Contributed by Cygnus Support + +This file is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3, or (at your option) any +later version. + +This file is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. 
+ +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + +/* This file contains mips16 floating point support functions. These + functions are called by mips16 code to handle floating point when + -msoft-float is not used. They accept the arguments and return + values using the soft-float calling convention, but do the actual + operation using the hard floating point instructions. */ + +#if defined _MIPS_SIM && (_MIPS_SIM == _ABIO32 || _MIPS_SIM == _ABIO64) + +/* This file contains 32-bit assembly code. */ + .set nomips16 + +/* Start a function. */ + +#define STARTFN(NAME) .globl NAME; .ent NAME; NAME: + +/* Finish a function. */ + +#define ENDFN(NAME) .end NAME + +/* ARG1 + The FPR that holds the first floating-point argument. + + ARG2 + The FPR that holds the second floating-point argument. + + RET + The FPR that holds a floating-point return value. */ + +#define RET $f0 +#define ARG1 $f12 +#ifdef __mips64 +#define ARG2 $f13 +#else +#define ARG2 $f14 +#endif + +/* Set 64-bit register GPR so that its high 32 bits contain HIGH_FPR + and so that its low 32 bits contain LOW_FPR. */ +#define MERGE_GPRf(GPR, HIGH_FPR, LOW_FPR) \ + .set noat; \ + mfc1 $1, LOW_FPR; \ + mfc1 GPR, HIGH_FPR; \ + dsll $1, $1, 32; \ + dsll GPR, GPR, 32; \ + dsrl $1, $1, 32; \ + or GPR, GPR, $1; \ + .set at + +/* Move the high 32 bits of GPR to HIGH_FPR and the low 32 bits of + GPR to LOW_FPR. */ +#define MERGE_GPRt(GPR, HIGH_FPR, LOW_FPR) \ + .set noat; \ + dsrl $1, GPR, 32; \ + mtc1 GPR, LOW_FPR; \ + mtc1 $1, HIGH_FPR; \ + .set at + +/* Jump to T, and use "OPCODE, OP2" to implement a delayed move. */ +#define DELAYt(T, OPCODE, OP2) \ + .set noreorder; \ + jr T; \ + OPCODE, OP2; \ + .set reorder + +/* Use "OPCODE, OP2" and jump to T. 
*/ +#define DELAYf(T, OPCODE, OP2) OPCODE, OP2; jr T + +/* MOVE_SF_BYTE0(D) + Move the first single-precision floating-point argument between + GPRs and FPRs. + + MOVE_SI_BYTE0(D) + Likewise the first single-precision integer argument. + + MOVE_SF_BYTE4(D) + Move the second single-precision floating-point argument between + GPRs and FPRs, given that the first argument occupies 4 bytes. + + MOVE_SF_BYTE8(D) + Move the second single-precision floating-point argument between + GPRs and FPRs, given that the first argument occupies 8 bytes. + + MOVE_DF_BYTE0(D) + Move the first double-precision floating-point argument between + GPRs and FPRs. + + MOVE_DF_BYTE8(D) + Likewise the second double-precision floating-point argument. + + MOVE_SF_RET(D, T) + Likewise a single-precision floating-point return value, + then jump to T. + + MOVE_SC_RET(D, T) + Likewise a complex single-precision floating-point return value. + + MOVE_DF_RET(D, T) + Likewise a double-precision floating-point return value. + + MOVE_DC_RET(D, T) + Likewise a complex double-precision floating-point return value. + + MOVE_SI_RET(D, T) + Likewise a single-precision integer return value. + + The D argument is "t" to move to FPRs and "f" to move from FPRs. + The return macros may assume that the target of the jump does not + use a floating-point register. */ + +#define MOVE_SF_RET(D, T) DELAY##D (T, m##D##c1 $2,$f0) +#define MOVE_SI_RET(D, T) DELAY##D (T, m##D##c1 $2,$f0) + +#if defined(__mips64) && defined(__MIPSEB__) +#define MOVE_SC_RET(D, T) MERGE_GPR##D ($2, $f0, $f1); jr T +#elif defined(__mips64) +/* The high 32 bits of $2 correspond to the second word in memory; + i.e. the imaginary part. 
*/ +#define MOVE_SC_RET(D, T) MERGE_GPR##D ($2, $f1, $f0); jr T +#elif __mips_fpr == 64 +#define MOVE_SC_RET(D, T) m##D##c1 $2,$f0; DELAY##D (T, m##D##c1 $3,$f1) +#else +#define MOVE_SC_RET(D, T) m##D##c1 $2,$f0; DELAY##D (T, m##D##c1 $3,$f2) +#endif + +#if defined(__mips64) +#define MOVE_SF_BYTE0(D) m##D##c1 $4,$f12 +#define MOVE_SF_BYTE4(D) m##D##c1 $5,$f13 +#define MOVE_SF_BYTE8(D) m##D##c1 $5,$f13 +#else +#define MOVE_SF_BYTE0(D) m##D##c1 $4,$f12 +#define MOVE_SF_BYTE4(D) m##D##c1 $5,$f14 +#define MOVE_SF_BYTE8(D) m##D##c1 $6,$f14 +#endif +#define MOVE_SI_BYTE0(D) MOVE_SF_BYTE0(D) + +#if defined(__mips64) +#define MOVE_DF_BYTE0(D) dm##D##c1 $4,$f12 +#define MOVE_DF_BYTE8(D) dm##D##c1 $5,$f13 +#define MOVE_DF_RET(D, T) DELAY##D (T, dm##D##c1 $2,$f0) +#define MOVE_DC_RET(D, T) dm##D##c1 $3,$f1; MOVE_DF_RET (D, T) +#elif __mips_fpr == 64 && defined(__MIPSEB__) +#define MOVE_DF_BYTE0(D) m##D##c1 $5,$f12; m##D##hc1 $4,$f12 +#define MOVE_DF_BYTE8(D) m##D##c1 $7,$f14; m##D##hc1 $6,$f14 +#define MOVE_DF_RET(D, T) m##D##c1 $3,$f0; DELAY##D (T, m##D##hc1 $2,$f0) +#define MOVE_DC_RET(D, T) m##D##c1 $5,$f1; m##D##hc1 $4,$f1; MOVE_DF_RET (D, T) +#elif __mips_fpr == 64 +#define MOVE_DF_BYTE0(D) m##D##c1 $4,$f12; m##D##hc1 $5,$f12 +#define MOVE_DF_BYTE8(D) m##D##c1 $6,$f14; m##D##hc1 $7,$f14 +#define MOVE_DF_RET(D, T) m##D##c1 $2,$f0; DELAY##D (T, m##D##hc1 $3,$f0) +#define MOVE_DC_RET(D, T) m##D##c1 $4,$f1; m##D##hc1 $5,$f1; MOVE_DF_RET (D, T) +#elif defined(__MIPSEB__) +/* FPRs are little-endian. 
*/ +#define MOVE_DF_BYTE0(D) m##D##c1 $4,$f13; m##D##c1 $5,$f12 +#define MOVE_DF_BYTE8(D) m##D##c1 $6,$f15; m##D##c1 $7,$f14 +#define MOVE_DF_RET(D, T) m##D##c1 $2,$f1; DELAY##D (T, m##D##c1 $3,$f0) +#define MOVE_DC_RET(D, T) m##D##c1 $4,$f3; m##D##c1 $5,$f2; MOVE_DF_RET (D, T) +#else +#define MOVE_DF_BYTE0(D) m##D##c1 $4,$f12; m##D##c1 $5,$f13 +#define MOVE_DF_BYTE8(D) m##D##c1 $6,$f14; m##D##c1 $7,$f15 +#define MOVE_DF_RET(D, T) m##D##c1 $2,$f0; DELAY##D (T, m##D##c1 $3,$f1) +#define MOVE_DC_RET(D, T) m##D##c1 $4,$f2; m##D##c1 $5,$f3; MOVE_DF_RET (D, T) +#endif + +/* Single-precision math. */ + +/* Define a function NAME that loads two single-precision values, + performs FPU operation OPCODE on them, and returns the single- + precision result. */ + +#define OPSF3(NAME, OPCODE) \ +STARTFN (NAME); \ + MOVE_SF_BYTE0 (t); \ + MOVE_SF_BYTE4 (t); \ + OPCODE RET,ARG1,ARG2; \ + MOVE_SF_RET (f, $31); \ + ENDFN (NAME) + +#ifdef L_m16addsf3 +OPSF3 (__mips16_addsf3, add.s) +#endif +#ifdef L_m16subsf3 +OPSF3 (__mips16_subsf3, sub.s) +#endif +#ifdef L_m16mulsf3 +OPSF3 (__mips16_mulsf3, mul.s) +#endif +#ifdef L_m16divsf3 +OPSF3 (__mips16_divsf3, div.s) +#endif + +/* Define a function NAME that loads a single-precision value, + performs FPU operation OPCODE on it, and returns the single- + precision result. */ + +#define OPSF2(NAME, OPCODE) \ +STARTFN (NAME); \ + MOVE_SF_BYTE0 (t); \ + OPCODE RET,ARG1; \ + MOVE_SF_RET (f, $31); \ + ENDFN (NAME) + +#ifdef L_m16negsf2 +OPSF2 (__mips16_negsf2, neg.s) +#endif +#ifdef L_m16abssf2 +OPSF2 (__mips16_abssf2, abs.s) +#endif + +/* Single-precision comparisons. */ + +/* Define a function NAME that loads two single-precision values, + performs floating point comparison OPCODE, and returns TRUE or + FALSE depending on the result. 
*/ + +#define CMPSF(NAME, OPCODE, TRUE, FALSE) \ +STARTFN (NAME); \ + MOVE_SF_BYTE0 (t); \ + MOVE_SF_BYTE4 (t); \ + OPCODE ARG1,ARG2; \ + li $2,TRUE; \ + bc1t 1f; \ + li $2,FALSE; \ +1:; \ + j $31; \ + ENDFN (NAME) + +/* Like CMPSF, but reverse the comparison operands. */ + +#define REVCMPSF(NAME, OPCODE, TRUE, FALSE) \ +STARTFN (NAME); \ + MOVE_SF_BYTE0 (t); \ + MOVE_SF_BYTE4 (t); \ + OPCODE ARG2,ARG1; \ + li $2,TRUE; \ + bc1t 1f; \ + li $2,FALSE; \ +1:; \ + j $31; \ + ENDFN (NAME) + +#ifdef L_m16eqsf2 +CMPSF (__mips16_eqsf2, c.eq.s, 0, 1) +#endif +#ifdef L_m16nesf2 +CMPSF (__mips16_nesf2, c.eq.s, 0, 1) +#endif +#ifdef L_m16gtsf2 +REVCMPSF (__mips16_gtsf2, c.lt.s, 1, 0) +#endif +#ifdef L_m16gesf2 +REVCMPSF (__mips16_gesf2, c.le.s, 0, -1) +#endif +#ifdef L_m16lesf2 +CMPSF (__mips16_lesf2, c.le.s, 0, 1) +#endif +#ifdef L_m16ltsf2 +CMPSF (__mips16_ltsf2, c.lt.s, -1, 0) +#endif +#ifdef L_m16unordsf2 +CMPSF(__mips16_unordsf2, c.un.s, 1, 0) +#endif + + +/* Single-precision conversions. */ + +#ifdef L_m16fltsisf +STARTFN (__mips16_floatsisf) + MOVE_SF_BYTE0 (t) + cvt.s.w RET,ARG1 + MOVE_SF_RET (f, $31) + ENDFN (__mips16_floatsisf) +#endif + +#ifdef L_m16fltunsisf +STARTFN (__mips16_floatunsisf) + .set noreorder + bltz $4,1f + MOVE_SF_BYTE0 (t) + .set reorder + cvt.s.w RET,ARG1 + MOVE_SF_RET (f, $31) +1: + and $2,$4,1 + srl $3,$4,1 + or $2,$2,$3 + mtc1 $2,RET + cvt.s.w RET,RET + add.s RET,RET,RET + MOVE_SF_RET (f, $31) + ENDFN (__mips16_floatunsisf) +#endif + +#ifdef L_m16fix_truncsfsi +STARTFN (__mips16_fix_truncsfsi) + MOVE_SF_BYTE0 (t) + trunc.w.s RET,ARG1,$4 + MOVE_SI_RET (f, $31) + ENDFN (__mips16_fix_truncsfsi) +#endif + +#if !defined(__mips_single_float) && !defined(__SINGLE_FLOAT) + +/* Double-precision math. */ + +/* Define a function NAME that loads two double-precision values, + performs FPU operation OPCODE on them, and returns the double- + precision result. 
*/ + +#define OPDF3(NAME, OPCODE) \ +STARTFN (NAME); \ + MOVE_DF_BYTE0 (t); \ + MOVE_DF_BYTE8 (t); \ + OPCODE RET,ARG1,ARG2; \ + MOVE_DF_RET (f, $31); \ + ENDFN (NAME) + +#ifdef L_m16adddf3 +OPDF3 (__mips16_adddf3, add.d) +#endif +#ifdef L_m16subdf3 +OPDF3 (__mips16_subdf3, sub.d) +#endif +#ifdef L_m16muldf3 +OPDF3 (__mips16_muldf3, mul.d) +#endif +#ifdef L_m16divdf3 +OPDF3 (__mips16_divdf3, div.d) +#endif + +/* Define a function NAME that loads a double-precision value, + performs FPU operation OPCODE on it, and returns the double- + precision result. */ + +#define OPDF2(NAME, OPCODE) \ +STARTFN (NAME); \ + MOVE_DF_BYTE0 (t); \ + OPCODE RET,ARG1; \ + MOVE_DF_RET (f, $31); \ + ENDFN (NAME) + +#ifdef L_m16negdf2 +OPDF2 (__mips16_negdf2, neg.d) +#endif +#ifdef L_m16absdf2 +OPDF2 (__mips16_absdf2, abs.d) +#endif + +/* Conversions between single and double precision. */ + +#ifdef L_m16extsfdf2 +STARTFN (__mips16_extendsfdf2) + MOVE_SF_BYTE0 (t) + cvt.d.s RET,ARG1 + MOVE_DF_RET (f, $31) + ENDFN (__mips16_extendsfdf2) +#endif + +#ifdef L_m16trdfsf2 +STARTFN (__mips16_truncdfsf2) + MOVE_DF_BYTE0 (t) + cvt.s.d RET,ARG1 + MOVE_SF_RET (f, $31) + ENDFN (__mips16_truncdfsf2) +#endif + +/* Double-precision comparisons. */ + +/* Define a function NAME that loads two double-precision values, + performs floating point comparison OPCODE, and returns TRUE or + FALSE depending on the result. */ + +#define CMPDF(NAME, OPCODE, TRUE, FALSE) \ +STARTFN (NAME); \ + MOVE_DF_BYTE0 (t); \ + MOVE_DF_BYTE8 (t); \ + OPCODE ARG1,ARG2; \ + li $2,TRUE; \ + bc1t 1f; \ + li $2,FALSE; \ +1:; \ + j $31; \ + ENDFN (NAME) + +/* Like CMPDF, but reverse the comparison operands. 
*/ + +#define REVCMPDF(NAME, OPCODE, TRUE, FALSE) \ +STARTFN (NAME); \ + MOVE_DF_BYTE0 (t); \ + MOVE_DF_BYTE8 (t); \ + OPCODE ARG2,ARG1; \ + li $2,TRUE; \ + bc1t 1f; \ + li $2,FALSE; \ +1:; \ + j $31; \ + ENDFN (NAME) + +#ifdef L_m16eqdf2 +CMPDF (__mips16_eqdf2, c.eq.d, 0, 1) +#endif +#ifdef L_m16nedf2 +CMPDF (__mips16_nedf2, c.eq.d, 0, 1) +#endif +#ifdef L_m16gtdf2 +REVCMPDF (__mips16_gtdf2, c.lt.d, 1, 0) +#endif +#ifdef L_m16gedf2 +REVCMPDF (__mips16_gedf2, c.le.d, 0, -1) +#endif +#ifdef L_m16ledf2 +CMPDF (__mips16_ledf2, c.le.d, 0, 1) +#endif +#ifdef L_m16ltdf2 +CMPDF (__mips16_ltdf2, c.lt.d, -1, 0) +#endif +#ifdef L_m16unorddf2 +CMPDF(__mips16_unorddf2, c.un.d, 1, 0) +#endif + +/* Double-precision conversions. */ + +#ifdef L_m16fltsidf +STARTFN (__mips16_floatsidf) + MOVE_SI_BYTE0 (t) + cvt.d.w RET,ARG1 + MOVE_DF_RET (f, $31) + ENDFN (__mips16_floatsidf) +#endif + +#ifdef L_m16fltunsidf +STARTFN (__mips16_floatunsidf) + MOVE_SI_BYTE0 (t) + cvt.d.w RET,ARG1 + bgez $4,1f + li.d ARG1, 4.294967296e+9 + add.d RET, RET, ARG1 +1: MOVE_DF_RET (f, $31) + ENDFN (__mips16_floatunsidf) +#endif + +#ifdef L_m16fix_truncdfsi +STARTFN (__mips16_fix_truncdfsi) + MOVE_DF_BYTE0 (t) + trunc.w.d RET,ARG1,$4 + MOVE_SI_RET (f, $31) + ENDFN (__mips16_fix_truncdfsi) +#endif +#endif /* !__mips_single_float */ + +/* Define a function NAME that moves a return value of mode MODE from + FPRs to GPRs. */ + +#define RET_FUNCTION(NAME, MODE) \ +STARTFN (NAME); \ + MOVE_##MODE##_RET (t, $31); \ + ENDFN (NAME) + +#ifdef L_m16retsf +RET_FUNCTION (__mips16_ret_sf, SF) +#endif + +#ifdef L_m16retsc +RET_FUNCTION (__mips16_ret_sc, SC) +#endif + +#if !defined(__mips_single_float) && !defined(__SINGLE_FLOAT) +#ifdef L_m16retdf +RET_FUNCTION (__mips16_ret_df, DF) +#endif + +#ifdef L_m16retdc +RET_FUNCTION (__mips16_ret_dc, DC) +#endif +#endif /* !__mips_single_float */ + +/* STUB_ARGS_X copies the arguments from GPRs to FPRs for argument + code X. 
X is calculated as ARG1 + ARG2 * 4, where ARG1 and ARG2 + classify the first and second arguments as follows: + + 1: a single-precision argument + 2: a double-precision argument + 0: no argument, or not one of the above. */ + +#define STUB_ARGS_0 /* () */ +#define STUB_ARGS_1 MOVE_SF_BYTE0 (t) /* (sf) */ +#define STUB_ARGS_5 MOVE_SF_BYTE0 (t); MOVE_SF_BYTE4 (t) /* (sf, sf) */ +#define STUB_ARGS_9 MOVE_SF_BYTE0 (t); MOVE_DF_BYTE8 (t) /* (sf, df) */ +#define STUB_ARGS_2 MOVE_DF_BYTE0 (t) /* (df) */ +#define STUB_ARGS_6 MOVE_DF_BYTE0 (t); MOVE_SF_BYTE8 (t) /* (df, sf) */ +#define STUB_ARGS_10 MOVE_DF_BYTE0 (t); MOVE_DF_BYTE8 (t) /* (df, df) */ + +/* These functions are used by 16-bit code when calling via a function + pointer. They must copy the floating point arguments from the GPRs + to FPRs and then call function $2. */ + +#define CALL_STUB_NO_RET(NAME, CODE) \ +STARTFN (NAME); \ + STUB_ARGS_##CODE; \ + .set noreorder; \ + jr $2; \ + move $25,$2; \ + .set reorder; \ + ENDFN (NAME) + +#ifdef L_m16stub1 +CALL_STUB_NO_RET (__mips16_call_stub_1, 1) +#endif + +#ifdef L_m16stub5 +CALL_STUB_NO_RET (__mips16_call_stub_5, 5) +#endif + +#if !defined(__mips_single_float) && !defined(__SINGLE_FLOAT) + +#ifdef L_m16stub2 +CALL_STUB_NO_RET (__mips16_call_stub_2, 2) +#endif + +#ifdef L_m16stub6 +CALL_STUB_NO_RET (__mips16_call_stub_6, 6) +#endif + +#ifdef L_m16stub9 +CALL_STUB_NO_RET (__mips16_call_stub_9, 9) +#endif + +#ifdef L_m16stub10 +CALL_STUB_NO_RET (__mips16_call_stub_10, 10) +#endif +#endif /* !__mips_single_float */ + +/* Now we have the same set of functions, except that this time the + function being called returns an SFmode, SCmode, DFmode or DCmode + value; we need to instantiate a set for each case. The calling + function will arrange to preserve $18, so these functions are free + to use it to hold the return address. + + Note that we do not know whether the function we are calling is 16 + bit or 32 bit. 
However, it does not matter, because 16-bit + functions always return floating point values in both the gp and + the fp regs. It would be possible to check whether the function + being called is 16 bits, in which case the copy is unnecessary; + however, it's faster to always do the copy. */ + +#define CALL_STUB_RET(NAME, CODE, MODE) \ +STARTFN (NAME); \ + move $18,$31; \ + STUB_ARGS_##CODE; \ + .set noreorder; \ + jalr $2; \ + move $25,$2; \ + .set reorder; \ + MOVE_##MODE##_RET (f, $18); \ + ENDFN (NAME) + +/* First, instantiate the single-float set. */ + +#ifdef L_m16stubsf0 +CALL_STUB_RET (__mips16_call_stub_sf_0, 0, SF) +#endif + +#ifdef L_m16stubsf1 +CALL_STUB_RET (__mips16_call_stub_sf_1, 1, SF) +#endif + +#ifdef L_m16stubsf5 +CALL_STUB_RET (__mips16_call_stub_sf_5, 5, SF) +#endif + +#if !defined(__mips_single_float) && !defined(__SINGLE_FLOAT) +#ifdef L_m16stubsf2 +CALL_STUB_RET (__mips16_call_stub_sf_2, 2, SF) +#endif + +#ifdef L_m16stubsf6 +CALL_STUB_RET (__mips16_call_stub_sf_6, 6, SF) +#endif + +#ifdef L_m16stubsf9 +CALL_STUB_RET (__mips16_call_stub_sf_9, 9, SF) +#endif + +#ifdef L_m16stubsf10 +CALL_STUB_RET (__mips16_call_stub_sf_10, 10, SF) +#endif +#endif /* !__mips_single_float */ + + +/* Now we have the same set of functions again, except that this time + the function being called returns a DFmode value. 
*/ + +#if !defined(__mips_single_float) && !defined(__SINGLE_FLOAT) +#ifdef L_m16stubdf0 +CALL_STUB_RET (__mips16_call_stub_df_0, 0, DF) +#endif + +#ifdef L_m16stubdf1 +CALL_STUB_RET (__mips16_call_stub_df_1, 1, DF) +#endif + +#ifdef L_m16stubdf5 +CALL_STUB_RET (__mips16_call_stub_df_5, 5, DF) +#endif + +#ifdef L_m16stubdf2 +CALL_STUB_RET (__mips16_call_stub_df_2, 2, DF) +#endif + +#ifdef L_m16stubdf6 +CALL_STUB_RET (__mips16_call_stub_df_6, 6, DF) +#endif + +#ifdef L_m16stubdf9 +CALL_STUB_RET (__mips16_call_stub_df_9, 9, DF) +#endif + +#ifdef L_m16stubdf10 +CALL_STUB_RET (__mips16_call_stub_df_10, 10, DF) +#endif +#endif /* !__mips_single_float */ + + +/* Ho hum. Here we have the same set of functions again, this time + for when the function being called returns an SCmode value. */ + +#ifdef L_m16stubsc0 +CALL_STUB_RET (__mips16_call_stub_sc_0, 0, SC) +#endif + +#ifdef L_m16stubsc1 +CALL_STUB_RET (__mips16_call_stub_sc_1, 1, SC) +#endif + +#ifdef L_m16stubsc5 +CALL_STUB_RET (__mips16_call_stub_sc_5, 5, SC) +#endif + +#if !defined(__mips_single_float) && !defined(__SINGLE_FLOAT) +#ifdef L_m16stubsc2 +CALL_STUB_RET (__mips16_call_stub_sc_2, 2, SC) +#endif + +#ifdef L_m16stubsc6 +CALL_STUB_RET (__mips16_call_stub_sc_6, 6, SC) +#endif + +#ifdef L_m16stubsc9 +CALL_STUB_RET (__mips16_call_stub_sc_9, 9, SC) +#endif + +#ifdef L_m16stubsc10 +CALL_STUB_RET (__mips16_call_stub_sc_10, 10, SC) +#endif +#endif /* !__mips_single_float */ + + +/* Finally, another set of functions for DCmode. 
*/ + +#if !defined(__mips_single_float) && !defined(__SINGLE_FLOAT) +#ifdef L_m16stubdc0 +CALL_STUB_RET (__mips16_call_stub_dc_0, 0, DC) +#endif + +#ifdef L_m16stubdc1 +CALL_STUB_RET (__mips16_call_stub_dc_1, 1, DC) +#endif + +#ifdef L_m16stubdc5 +CALL_STUB_RET (__mips16_call_stub_dc_5, 5, DC) +#endif + +#ifdef L_m16stubdc2 +CALL_STUB_RET (__mips16_call_stub_dc_2, 2, DC) +#endif + +#ifdef L_m16stubdc6 +CALL_STUB_RET (__mips16_call_stub_dc_6, 6, DC) +#endif + +#ifdef L_m16stubdc9 +CALL_STUB_RET (__mips16_call_stub_dc_9, 9, DC) +#endif + +#ifdef L_m16stubdc10 +CALL_STUB_RET (__mips16_call_stub_dc_10, 10, DC) +#endif +#endif /* !__mips_single_float */ +#endif diff --git a/libgcc/config/mips/t-mips16 b/libgcc/config/mips/t-mips16 index 46c7472f5f6..5553ed76e2d 100644 --- a/libgcc/config/mips/t-mips16 +++ b/libgcc/config/mips/t-mips16 @@ -1,3 +1,43 @@ +# Copyright (C) 2007, 2008, 2011 Free Software Foundation, Inc. +# +# This file is part of GCC. +# +# GCC is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GCC is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# <http://www.gnu.org/licenses/>. 
+ +LIB1ASMSRC = mips/mips16.S +LIB1ASMFUNCS = _m16addsf3 _m16subsf3 _m16mulsf3 _m16divsf3 \ + _m16eqsf2 _m16nesf2 _m16gtsf2 _m16gesf2 _m16lesf2 _m16ltsf2 \ + _m16unordsf2 \ + _m16fltsisf _m16fix_truncsfsi _m16fltunsisf \ + _m16adddf3 _m16subdf3 _m16muldf3 _m16divdf3 \ + _m16extsfdf2 _m16trdfsf2 \ + _m16eqdf2 _m16nedf2 _m16gtdf2 _m16gedf2 _m16ledf2 _m16ltdf2 \ + _m16unorddf2 \ + _m16fltsidf _m16fix_truncdfsi _m16fltunsidf \ + _m16retsf _m16retdf \ + _m16retsc _m16retdc \ + _m16stub1 _m16stub2 _m16stub5 _m16stub6 _m16stub9 _m16stub10 \ + _m16stubsf0 _m16stubsf1 _m16stubsf2 _m16stubsf5 _m16stubsf6 \ + _m16stubsf9 _m16stubsf10 \ + _m16stubdf0 _m16stubdf1 _m16stubdf2 _m16stubdf5 _m16stubdf6 \ + _m16stubdf9 _m16stubdf10 \ + _m16stubsc0 _m16stubsc1 _m16stubsc2 _m16stubsc5 _m16stubsc6 \ + _m16stubsc9 _m16stubsc10 \ + _m16stubdc0 _m16stubdc1 _m16stubdc2 _m16stubdc5 _m16stubdc6 \ + _m16stubdc9 _m16stubdc10 + SYNC = yes SYNC_CFLAGS = -mno-mips16 diff --git a/libgcc/config/pa/milli64.S b/libgcc/config/pa/milli64.S new file mode 100644 index 00000000000..2e9c4f741b6 --- /dev/null +++ b/libgcc/config/pa/milli64.S @@ -0,0 +1,2134 @@ +/* 32 and 64-bit millicode, original author Hewlett-Packard + adapted for gcc by Paul Bame <bame@debian.org> + and Alan Modra <alan@linuxcare.com.au>. + + Copyright 2001, 2002, 2003, 2007, 2009 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + +#ifdef pa64 + .level 2.0w +#endif + +/* Hardware General Registers. */ +r0: .reg %r0 +r1: .reg %r1 +r2: .reg %r2 +r3: .reg %r3 +r4: .reg %r4 +r5: .reg %r5 +r6: .reg %r6 +r7: .reg %r7 +r8: .reg %r8 +r9: .reg %r9 +r10: .reg %r10 +r11: .reg %r11 +r12: .reg %r12 +r13: .reg %r13 +r14: .reg %r14 +r15: .reg %r15 +r16: .reg %r16 +r17: .reg %r17 +r18: .reg %r18 +r19: .reg %r19 +r20: .reg %r20 +r21: .reg %r21 +r22: .reg %r22 +r23: .reg %r23 +r24: .reg %r24 +r25: .reg %r25 +r26: .reg %r26 +r27: .reg %r27 +r28: .reg %r28 +r29: .reg %r29 +r30: .reg %r30 +r31: .reg %r31 + +/* Hardware Space Registers. */ +sr0: .reg %sr0 +sr1: .reg %sr1 +sr2: .reg %sr2 +sr3: .reg %sr3 +sr4: .reg %sr4 +sr5: .reg %sr5 +sr6: .reg %sr6 +sr7: .reg %sr7 + +/* Hardware Floating Point Registers. */ +fr0: .reg %fr0 +fr1: .reg %fr1 +fr2: .reg %fr2 +fr3: .reg %fr3 +fr4: .reg %fr4 +fr5: .reg %fr5 +fr6: .reg %fr6 +fr7: .reg %fr7 +fr8: .reg %fr8 +fr9: .reg %fr9 +fr10: .reg %fr10 +fr11: .reg %fr11 +fr12: .reg %fr12 +fr13: .reg %fr13 +fr14: .reg %fr14 +fr15: .reg %fr15 + +/* Hardware Control Registers. */ +cr11: .reg %cr11 +sar: .reg %cr11 /* Shift Amount Register */ + +/* Software Architecture General Registers. 
*/ +rp: .reg r2 /* return pointer */ +#ifdef pa64 +mrp: .reg r2 /* millicode return pointer */ +#else +mrp: .reg r31 /* millicode return pointer */ +#endif +ret0: .reg r28 /* return value */ +ret1: .reg r29 /* return value (high part of double) */ +sp: .reg r30 /* stack pointer */ +dp: .reg r27 /* data pointer */ +arg0: .reg r26 /* argument */ +arg1: .reg r25 /* argument or high part of double argument */ +arg2: .reg r24 /* argument */ +arg3: .reg r23 /* argument or high part of double argument */ + +/* Software Architecture Space Registers. */ +/* sr0 ; return link from BLE */ +sret: .reg sr1 /* return value */ +sarg: .reg sr1 /* argument */ +/* sr4 ; PC SPACE tracker */ +/* sr5 ; process private data */ + +/* Frame Offsets (millicode convention!) Used when calling other + millicode routines. Stack unwinding is dependent upon these + definitions. */ +r31_slot: .equ -20 /* "current RP" slot */ +sr0_slot: .equ -16 /* "static link" slot */ +#if defined(pa64) +mrp_slot: .equ -16 /* "current RP" slot */ +psp_slot: .equ -8 /* "previous SP" slot */ +#else +mrp_slot: .equ -20 /* "current RP" slot (replacing "r31_slot") */ +#endif + + +#define DEFINE(name,value)name: .EQU value +#define RDEFINE(name,value)name: .REG value +#ifdef milliext +#define MILLI_BE(lbl) BE lbl(sr7,r0) +#define MILLI_BEN(lbl) BE,n lbl(sr7,r0) +#define MILLI_BLE(lbl) BLE lbl(sr7,r0) +#define MILLI_BLEN(lbl) BLE,n lbl(sr7,r0) +#define MILLIRETN BE,n 0(sr0,mrp) +#define MILLIRET BE 0(sr0,mrp) +#define MILLI_RETN BE,n 0(sr0,mrp) +#define MILLI_RET BE 0(sr0,mrp) +#else +#define MILLI_BE(lbl) B lbl +#define MILLI_BEN(lbl) B,n lbl +#define MILLI_BLE(lbl) BL lbl,mrp +#define MILLI_BLEN(lbl) BL,n lbl,mrp +#define MILLIRETN BV,n 0(mrp) +#define MILLIRET BV 0(mrp) +#define MILLI_RETN BV,n 0(mrp) +#define MILLI_RET BV 0(mrp) +#endif + +#ifdef __STDC__ +#define CAT(a,b) a##b +#else +#define CAT(a,b) a/**/b +#endif + +#ifdef ELF +#define SUBSPA_MILLI .section .text +#define SUBSPA_MILLI_DIV .section 
.text.div,"ax",@progbits! .align 16 +#define SUBSPA_MILLI_MUL .section .text.mul,"ax",@progbits! .align 16 +#define ATTR_MILLI +#define SUBSPA_DATA .section .data +#define ATTR_DATA +#define GLOBAL $global$ +#define GSYM(sym) !sym: +#define LSYM(sym) !CAT(.L,sym:) +#define LREF(sym) CAT(.L,sym) + +#else + +#ifdef coff +/* This used to be .milli but since link32 places different named + sections in different segments millicode ends up a long ways away + from .text (1meg?). This way they will be a lot closer. + + The SUBSPA_MILLI_* specify locality sets for certain millicode + modules in order to ensure that modules that call one another are + placed close together. Without locality sets this is unlikely to + happen because of the Dynamite linker library search algorithm. We + want these modules close together so that short calls always reach + (we don't want to require long calls or use long call stubs). */ + +#define SUBSPA_MILLI .subspa .text +#define SUBSPA_MILLI_DIV .subspa .text$dv,align=16 +#define SUBSPA_MILLI_MUL .subspa .text$mu,align=16 +#define ATTR_MILLI .attr code,read,execute +#define SUBSPA_DATA .subspa .data +#define ATTR_DATA .attr init_data,read,write +#define GLOBAL _gp +#else +#define SUBSPA_MILLI .subspa $MILLICODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,SORT=8 +#define SUBSPA_MILLI_DIV SUBSPA_MILLI +#define SUBSPA_MILLI_MUL SUBSPA_MILLI +#define ATTR_MILLI +#define SUBSPA_DATA .subspa $BSS$,quad=1,align=8,access=0x1f,sort=80,zero +#define ATTR_DATA +#define GLOBAL $global$ +#endif +#define SPACE_DATA .space $PRIVATE$,spnum=1,sort=16 + +#define GSYM(sym) !sym +#define LSYM(sym) !CAT(L$,sym) +#define LREF(sym) CAT(L$,sym) +#endif + +#ifdef L_dyncall + SUBSPA_MILLI + ATTR_DATA +GSYM($$dyncall) + .export $$dyncall,millicode + .proc + .callinfo millicode + .entry + bb,>=,n %r22,30,LREF(1) ; branch if not plabel address + depi 0,31,2,%r22 ; clear the two least significant bits + ldw 4(%r22),%r19 ; load new LTP value + ldw 0(%r22),%r22 ; load address of target 
+LSYM(1) +#ifdef LINUX + bv %r0(%r22) ; branch to the real target +#else + ldsid (%sr0,%r22),%r1 ; get the "space ident" selected by r22 + mtsp %r1,%sr0 ; move that space identifier into sr0 + be 0(%sr0,%r22) ; branch to the real target +#endif + stw %r2,-24(%r30) ; save return address into frame marker + .exit + .procend +#endif + +#ifdef L_divI +/* ROUTINES: $$divI, $$divoI + + Single precision divide for signed binary integers. + + The quotient is truncated towards zero. + The sign of the quotient is the XOR of the signs of the dividend and + divisor. + Divide by zero is trapped. + Divide of -2**31 by -1 is trapped for $$divoI but not for $$divI. + + INPUT REGISTERS: + . arg0 == dividend + . arg1 == divisor + . mrp == return pc + . sr0 == return space when called externally + + OUTPUT REGISTERS: + . arg0 = undefined + . arg1 = undefined + . ret1 = quotient + + OTHER REGISTERS AFFECTED: + . r1 = undefined + + SIDE EFFECTS: + . Causes a trap under the following conditions: + . divisor is zero (traps with ADDIT,= 0,25,0) + . dividend==-2**31 and divisor==-1 and routine is $$divoI + . (traps with ADDO 26,25,0) + . Changes memory at the following places: + . NONE + + PERMISSIBLE CONTEXT: + . Unwindable. + . Suitable for internal or external millicode. + . Assumes the special millicode register conventions. + + DISCUSSION: + . Branches to other millicode routines using BE + . $$divI_# for # being 2,3,4,5,6,7,8,9,10,12,14,15 + . + . For selected divisors, calls a divide by constant routine written by + . Karl Pettis. Eligible divisors are 1..15 excluding 11 and 13. + . + . The only overflow case is -2**31 divided by -1. + . Both routines return -2**31 but only $$divoI traps. 
*/ + +RDEFINE(temp,r1) +RDEFINE(retreg,ret1) /* r29 */ +RDEFINE(temp1,arg0) + SUBSPA_MILLI_DIV + ATTR_MILLI + .import $$divI_2,millicode + .import $$divI_3,millicode + .import $$divI_4,millicode + .import $$divI_5,millicode + .import $$divI_6,millicode + .import $$divI_7,millicode + .import $$divI_8,millicode + .import $$divI_9,millicode + .import $$divI_10,millicode + .import $$divI_12,millicode + .import $$divI_14,millicode + .import $$divI_15,millicode + .export $$divI,millicode + .export $$divoI,millicode + .proc + .callinfo millicode + .entry +GSYM($$divoI) + comib,=,n -1,arg1,LREF(negative1) /* when divisor == -1 */ +GSYM($$divI) + ldo -1(arg1),temp /* is there at most one bit set ? */ + and,<> arg1,temp,r0 /* if not, don't use power of 2 divide */ + addi,> 0,arg1,r0 /* if divisor > 0, use power of 2 divide */ + b,n LREF(neg_denom) +LSYM(pow2) + addi,>= 0,arg0,retreg /* if numerator is negative, add the */ + add arg0,temp,retreg /* (denominator - 1) to correct for shifts */ + extru,= arg1,15,16,temp /* test denominator with 0xffff0000 */ + extrs retreg,15,16,retreg /* retreg = retreg >> 16 */ + or arg1,temp,arg1 /* arg1 = arg1 | (arg1 >> 16) */ + ldi 0xcc,temp1 /* setup 0xcc in temp1 */ + extru,= arg1,23,8,temp /* test denominator with 0xff00 */ + extrs retreg,23,24,retreg /* retreg = retreg >> 8 */ + or arg1,temp,arg1 /* arg1 = arg1 | (arg1 >> 8) */ + ldi 0xaa,temp /* setup 0xaa in temp */ + extru,= arg1,27,4,r0 /* test denominator with 0xf0 */ + extrs retreg,27,28,retreg /* retreg = retreg >> 4 */ + and,= arg1,temp1,r0 /* test denominator with 0xcc */ + extrs retreg,29,30,retreg /* retreg = retreg >> 2 */ + and,= arg1,temp,r0 /* test denominator with 0xaa */ + extrs retreg,30,31,retreg /* retreg = retreg >> 1 */ + MILLIRETN +LSYM(neg_denom) + addi,< 0,arg1,r0 /* if arg1 >= 0, it's not power of 2 */ + b,n LREF(regular_seq) + sub r0,arg1,temp /* make denominator positive */ + comb,=,n arg1,temp,LREF(regular_seq) /* test against 0x80000000 and 0 */ + ldo 
-1(temp),retreg /* is there at most one bit set ? */ + and,= temp,retreg,r0 /* if so, the denominator is power of 2 */ + b,n LREF(regular_seq) + sub r0,arg0,retreg /* negate numerator */ + comb,=,n arg0,retreg,LREF(regular_seq) /* test against 0x80000000 */ + copy retreg,arg0 /* set up arg0, arg1 and temp */ + copy temp,arg1 /* before branching to pow2 */ + b LREF(pow2) + ldo -1(arg1),temp +LSYM(regular_seq) + comib,>>=,n 15,arg1,LREF(small_divisor) + add,>= 0,arg0,retreg /* move dividend, if retreg < 0, */ +LSYM(normal) + subi 0,retreg,retreg /* make it positive */ + sub 0,arg1,temp /* clear carry, */ + /* negate the divisor */ + ds 0,temp,0 /* set V-bit to the comple- */ + /* ment of the divisor sign */ + add retreg,retreg,retreg /* shift msb bit into carry */ + ds r0,arg1,temp /* 1st divide step, if no carry */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 2nd divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 3rd divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 4th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 5th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 6th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 7th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 8th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 9th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 10th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 11th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 12th divide step */ + addc 
retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 13th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 14th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 15th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 16th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 17th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 18th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 19th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 20th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 21st divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 22nd divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 23rd divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 24th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 25th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 26th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 27th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 28th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 29th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 30th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 31st divide 
step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 32nd divide step, */ + addc retreg,retreg,retreg /* shift last retreg bit into retreg */ + xor,>= arg0,arg1,0 /* get correct sign of quotient */ + sub 0,retreg,retreg /* based on operand signs */ + MILLIRETN + nop + +LSYM(small_divisor) + +#if defined(pa64) +/* Clear the upper 32 bits of the arg1 register. We are working with */ +/* small divisors (and 32-bit integers) We must not be mislead */ +/* by "1" bits left in the upper 32 bits. */ + depd %r0,31,32,%r25 +#endif + blr,n arg1,r0 + nop +/* table for divisor == 0,1, ... ,15 */ + addit,= 0,arg1,r0 /* trap if divisor == 0 */ + nop + MILLIRET /* divisor == 1 */ + copy arg0,retreg + MILLI_BEN($$divI_2) /* divisor == 2 */ + nop + MILLI_BEN($$divI_3) /* divisor == 3 */ + nop + MILLI_BEN($$divI_4) /* divisor == 4 */ + nop + MILLI_BEN($$divI_5) /* divisor == 5 */ + nop + MILLI_BEN($$divI_6) /* divisor == 6 */ + nop + MILLI_BEN($$divI_7) /* divisor == 7 */ + nop + MILLI_BEN($$divI_8) /* divisor == 8 */ + nop + MILLI_BEN($$divI_9) /* divisor == 9 */ + nop + MILLI_BEN($$divI_10) /* divisor == 10 */ + nop + b LREF(normal) /* divisor == 11 */ + add,>= 0,arg0,retreg + MILLI_BEN($$divI_12) /* divisor == 12 */ + nop + b LREF(normal) /* divisor == 13 */ + add,>= 0,arg0,retreg + MILLI_BEN($$divI_14) /* divisor == 14 */ + nop + MILLI_BEN($$divI_15) /* divisor == 15 */ + nop + +LSYM(negative1) + sub 0,arg0,retreg /* result is negation of dividend */ + MILLIRET + addo arg0,arg1,r0 /* trap iff dividend==0x80000000 && divisor==-1 */ + .exit + .procend + .end +#endif + +#ifdef L_divU +/* ROUTINE: $$divU + . + . Single precision divide for unsigned integers. + . + . Quotient is truncated towards zero. + . Traps on divide by zero. + + INPUT REGISTERS: + . arg0 == dividend + . arg1 == divisor + . mrp == return pc + . sr0 == return space when called externally + + OUTPUT REGISTERS: + . arg0 = undefined + . arg1 = undefined + . 
ret1 = quotient + + OTHER REGISTERS AFFECTED: + . r1 = undefined + + SIDE EFFECTS: + . Causes a trap under the following conditions: + . divisor is zero + . Changes memory at the following places: + . NONE + + PERMISSIBLE CONTEXT: + . Unwindable. + . Does not create a stack frame. + . Suitable for internal or external millicode. + . Assumes the special millicode register conventions. + + DISCUSSION: + . Branchs to other millicode routines using BE: + . $$divU_# for 3,5,6,7,9,10,12,14,15 + . + . For selected small divisors calls the special divide by constant + . routines written by Karl Pettis. These are: 3,5,6,7,9,10,12,14,15. */ + +RDEFINE(temp,r1) +RDEFINE(retreg,ret1) /* r29 */ +RDEFINE(temp1,arg0) + SUBSPA_MILLI_DIV + ATTR_MILLI + .export $$divU,millicode + .import $$divU_3,millicode + .import $$divU_5,millicode + .import $$divU_6,millicode + .import $$divU_7,millicode + .import $$divU_9,millicode + .import $$divU_10,millicode + .import $$divU_12,millicode + .import $$divU_14,millicode + .import $$divU_15,millicode + .proc + .callinfo millicode + .entry +GSYM($$divU) +/* The subtract is not nullified since it does no harm and can be used + by the two cases that branch back to "normal". */ + ldo -1(arg1),temp /* is there at most one bit set ? 
*/ + and,= arg1,temp,r0 /* if so, denominator is power of 2 */ + b LREF(regular_seq) + addit,= 0,arg1,0 /* trap for zero dvr */ + copy arg0,retreg + extru,= arg1,15,16,temp /* test denominator with 0xffff0000 */ + extru retreg,15,16,retreg /* retreg = retreg >> 16 */ + or arg1,temp,arg1 /* arg1 = arg1 | (arg1 >> 16) */ + ldi 0xcc,temp1 /* setup 0xcc in temp1 */ + extru,= arg1,23,8,temp /* test denominator with 0xff00 */ + extru retreg,23,24,retreg /* retreg = retreg >> 8 */ + or arg1,temp,arg1 /* arg1 = arg1 | (arg1 >> 8) */ + ldi 0xaa,temp /* setup 0xaa in temp */ + extru,= arg1,27,4,r0 /* test denominator with 0xf0 */ + extru retreg,27,28,retreg /* retreg = retreg >> 4 */ + and,= arg1,temp1,r0 /* test denominator with 0xcc */ + extru retreg,29,30,retreg /* retreg = retreg >> 2 */ + and,= arg1,temp,r0 /* test denominator with 0xaa */ + extru retreg,30,31,retreg /* retreg = retreg >> 1 */ + MILLIRETN + nop +LSYM(regular_seq) + comib,>= 15,arg1,LREF(special_divisor) + subi 0,arg1,temp /* clear carry, negate the divisor */ + ds r0,temp,r0 /* set V-bit to 1 */ +LSYM(normal) + add arg0,arg0,retreg /* shift msb bit into carry */ + ds r0,arg1,temp /* 1st divide step, if no carry */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 2nd divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 3rd divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 4th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 5th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 6th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 7th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 8th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds 
temp,arg1,temp /* 9th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 10th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 11th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 12th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 13th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 14th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 15th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 16th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 17th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 18th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 19th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 20th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 21st divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 22nd divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 23rd divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 24th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 25th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 26th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 27th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into 
carry */ + ds temp,arg1,temp /* 28th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 29th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 30th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 31st divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds temp,arg1,temp /* 32nd divide step, */ + MILLIRET + addc retreg,retreg,retreg /* shift last retreg bit into retreg */ + +/* Handle the cases where divisor is a small constant or has high bit on. */ +LSYM(special_divisor) +/* blr arg1,r0 */ +/* comib,>,n 0,arg1,LREF(big_divisor) ; nullify previous instruction */ + +/* Pratap 8/13/90. The 815 Stirling chip set has a bug that prevents us from + generating such a blr, comib sequence. A problem in nullification. So I + rewrote this code. */ + +#if defined(pa64) +/* Clear the upper 32 bits of the arg1 register. We are working with + small divisors (and 32-bit unsigned integers) We must not be mislead + by "1" bits left in the upper 32 bits. 
*/ + depd %r0,31,32,%r25 +#endif + comib,> 0,arg1,LREF(big_divisor) + nop + blr arg1,r0 + nop + +LSYM(zero_divisor) /* this label is here to provide external visibility */ + addit,= 0,arg1,0 /* trap for zero dvr */ + nop + MILLIRET /* divisor == 1 */ + copy arg0,retreg + MILLIRET /* divisor == 2 */ + extru arg0,30,31,retreg + MILLI_BEN($$divU_3) /* divisor == 3 */ + nop + MILLIRET /* divisor == 4 */ + extru arg0,29,30,retreg + MILLI_BEN($$divU_5) /* divisor == 5 */ + nop + MILLI_BEN($$divU_6) /* divisor == 6 */ + nop + MILLI_BEN($$divU_7) /* divisor == 7 */ + nop + MILLIRET /* divisor == 8 */ + extru arg0,28,29,retreg + MILLI_BEN($$divU_9) /* divisor == 9 */ + nop + MILLI_BEN($$divU_10) /* divisor == 10 */ + nop + b LREF(normal) /* divisor == 11 */ + ds r0,temp,r0 /* set V-bit to 1 */ + MILLI_BEN($$divU_12) /* divisor == 12 */ + nop + b LREF(normal) /* divisor == 13 */ + ds r0,temp,r0 /* set V-bit to 1 */ + MILLI_BEN($$divU_14) /* divisor == 14 */ + nop + MILLI_BEN($$divU_15) /* divisor == 15 */ + nop + +/* Handle the case where the high bit is on in the divisor. + Compute: if( dividend>=divisor) quotient=1; else quotient=0; + Note: dividend>==divisor iff dividend-divisor does not borrow + and not borrow iff carry. */ +LSYM(big_divisor) + sub arg0,arg1,r0 + MILLIRET + addc r0,r0,retreg + .exit + .procend + .end +#endif + +#ifdef L_remI +/* ROUTINE: $$remI + + DESCRIPTION: + . $$remI returns the remainder of the division of two signed 32-bit + . integers. The sign of the remainder is the same as the sign of + . the dividend. + + + INPUT REGISTERS: + . arg0 == dividend + . arg1 == divisor + . mrp == return pc + . sr0 == return space when called externally + + OUTPUT REGISTERS: + . arg0 = destroyed + . arg1 = destroyed + . ret1 = remainder + + OTHER REGISTERS AFFECTED: + . r1 = undefined + + SIDE EFFECTS: + . Causes a trap under the following conditions: DIVIDE BY ZERO + . Changes memory at the following places: NONE + + PERMISSIBLE CONTEXT: + . Unwindable + . 
Does not create a stack frame + . Is usable for internal or external microcode + + DISCUSSION: + . Calls other millicode routines via mrp: NONE + . Calls other millicode routines: NONE */ + +RDEFINE(tmp,r1) +RDEFINE(retreg,ret1) + + SUBSPA_MILLI + ATTR_MILLI + .proc + .callinfo millicode + .entry +GSYM($$remI) +GSYM($$remoI) + .export $$remI,MILLICODE + .export $$remoI,MILLICODE + ldo -1(arg1),tmp /* is there at most one bit set ? */ + and,<> arg1,tmp,r0 /* if not, don't use power of 2 */ + addi,> 0,arg1,r0 /* if denominator > 0, use power */ + /* of 2 */ + b,n LREF(neg_denom) +LSYM(pow2) + comb,>,n 0,arg0,LREF(neg_num) /* is numerator < 0 ? */ + and arg0,tmp,retreg /* get the result */ + MILLIRETN +LSYM(neg_num) + subi 0,arg0,arg0 /* negate numerator */ + and arg0,tmp,retreg /* get the result */ + subi 0,retreg,retreg /* negate result */ + MILLIRETN +LSYM(neg_denom) + addi,< 0,arg1,r0 /* if arg1 >= 0, it's not power */ + /* of 2 */ + b,n LREF(regular_seq) + sub r0,arg1,tmp /* make denominator positive */ + comb,=,n arg1,tmp,LREF(regular_seq) /* test against 0x80000000 and 0 */ + ldo -1(tmp),retreg /* is there at most one bit set ? 
*/ + and,= tmp,retreg,r0 /* if not, go to regular_seq */ + b,n LREF(regular_seq) + comb,>,n 0,arg0,LREF(neg_num_2) /* if arg0 < 0, negate it */ + and arg0,retreg,retreg + MILLIRETN +LSYM(neg_num_2) + subi 0,arg0,tmp /* test against 0x80000000 */ + and tmp,retreg,retreg + subi 0,retreg,retreg + MILLIRETN +LSYM(regular_seq) + addit,= 0,arg1,0 /* trap if div by zero */ + add,>= 0,arg0,retreg /* move dividend, if retreg < 0, */ + sub 0,retreg,retreg /* make it positive */ + sub 0,arg1, tmp /* clear carry, */ + /* negate the divisor */ + ds 0, tmp,0 /* set V-bit to the comple- */ + /* ment of the divisor sign */ + or 0,0, tmp /* clear tmp */ + add retreg,retreg,retreg /* shift msb bit into carry */ + ds tmp,arg1, tmp /* 1st divide step, if no carry */ + /* out, msb of quotient = 0 */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ +LSYM(t1) + ds tmp,arg1, tmp /* 2nd divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 3rd divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 4th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 5th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 6th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 7th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 8th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 9th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 10th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 11th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 12th divide step */ + addc retreg,retreg,retreg /* shift retreg 
with/into carry */ + ds tmp,arg1, tmp /* 13th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 14th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 15th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 16th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 17th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 18th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 19th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 20th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 21st divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 22nd divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 23rd divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 24th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 25th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 26th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 27th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 28th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 29th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 30th divide step */ + addc retreg,retreg,retreg /* shift retreg with/into carry */ + ds tmp,arg1, tmp /* 31st divide step */ + addc retreg,retreg,retreg /* shift retreg 
with/into carry */ + ds tmp,arg1, tmp /* 32nd divide step, */ + addc retreg,retreg,retreg /* shift last bit into retreg */ + movb,>=,n tmp,retreg,LREF(finish) /* branch if pos. tmp */ + add,< arg1,0,0 /* if arg1 > 0, add arg1 */ + add,tr tmp,arg1,retreg /* for correcting remainder tmp */ + sub tmp,arg1,retreg /* else add absolute value arg1 */ +LSYM(finish) + add,>= arg0,0,0 /* set sign of remainder */ + sub 0,retreg,retreg /* to sign of dividend */ + MILLIRET + nop + .exit + .procend +#ifdef milliext + .origin 0x00000200 +#endif + .end +#endif + +#ifdef L_remU +/* ROUTINE: $$remU + . Single precision divide for remainder with unsigned binary integers. + . + . The remainder must be dividend-(dividend/divisor)*divisor. + . Divide by zero is trapped. + + INPUT REGISTERS: + . arg0 == dividend + . arg1 == divisor + . mrp == return pc + . sr0 == return space when called externally + + OUTPUT REGISTERS: + . arg0 = undefined + . arg1 = undefined + . ret1 = remainder + + OTHER REGISTERS AFFECTED: + . r1 = undefined + + SIDE EFFECTS: + . Causes a trap under the following conditions: DIVIDE BY ZERO + . Changes memory at the following places: NONE + + PERMISSIBLE CONTEXT: + . Unwindable. + . Does not create a stack frame. + . Suitable for internal or external millicode. + . Assumes the special millicode register conventions. + + DISCUSSION: + . Calls other millicode routines using mrp: NONE + . Calls other millicode routines: NONE */ + + +RDEFINE(temp,r1) +RDEFINE(rmndr,ret1) /* r29 */ + SUBSPA_MILLI + ATTR_MILLI + .export $$remU,millicode + .proc + .callinfo millicode + .entry +GSYM($$remU) + ldo -1(arg1),temp /* is there at most one bit set ? 
*/ + and,= arg1,temp,r0 /* if not, don't use power of 2 */ + b LREF(regular_seq) + addit,= 0,arg1,r0 /* trap on div by zero */ + and arg0,temp,rmndr /* get the result for power of 2 */ + MILLIRETN +LSYM(regular_seq) + comib,>=,n 0,arg1,LREF(special_case) + subi 0,arg1,rmndr /* clear carry, negate the divisor */ + ds r0,rmndr,r0 /* set V-bit to 1 */ + add arg0,arg0,temp /* shift msb bit into carry */ + ds r0,arg1,rmndr /* 1st divide step, if no carry */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 2nd divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 3rd divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 4th divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 5th divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 6th divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 7th divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 8th divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 9th divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 10th divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 11th divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 12th divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 13th divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 14th divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 15th divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 16th divide step */ + addc temp,temp,temp /* shift temp with/into 
carry */ + ds rmndr,arg1,rmndr /* 17th divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 18th divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 19th divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 20th divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 21st divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 22nd divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 23rd divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 24th divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 25th divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 26th divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 27th divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 28th divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 29th divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 30th divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 31st divide step */ + addc temp,temp,temp /* shift temp with/into carry */ + ds rmndr,arg1,rmndr /* 32nd divide step, */ + comiclr,<= 0,rmndr,r0 + add rmndr,arg1,rmndr /* correction */ + MILLIRETN + nop + +/* Putting >= on the last DS and deleting COMICLR does not work! */ +LSYM(special_case) + sub,>>= arg0,arg1,rmndr + copy arg0,rmndr + MILLIRETN + nop + .exit + .procend + .end +#endif + +#ifdef L_div_const +/* ROUTINE: $$divI_2 + . $$divI_3 $$divU_3 + . $$divI_4 + . $$divI_5 $$divU_5 + . $$divI_6 $$divU_6 + . $$divI_7 $$divU_7 + . $$divI_8 + . 
$$divI_9 $$divU_9 + . $$divI_10 $$divU_10 + . + . $$divI_12 $$divU_12 + . + . $$divI_14 $$divU_14 + . $$divI_15 $$divU_15 + . $$divI_16 + . $$divI_17 $$divU_17 + . + . Divide by selected constants for single precision binary integers. + + INPUT REGISTERS: + . arg0 == dividend + . mrp == return pc + . sr0 == return space when called externally + + OUTPUT REGISTERS: + . arg0 = undefined + . arg1 = undefined + . ret1 = quotient + + OTHER REGISTERS AFFECTED: + . r1 = undefined + + SIDE EFFECTS: + . Causes a trap under the following conditions: NONE + . Changes memory at the following places: NONE + + PERMISSIBLE CONTEXT: + . Unwindable. + . Does not create a stack frame. + . Suitable for internal or external millicode. + . Assumes the special millicode register conventions. + + DISCUSSION: + . Calls other millicode routines using mrp: NONE + . Calls other millicode routines: NONE */ + + +/* TRUNCATED DIVISION BY SMALL INTEGERS + + We are interested in q(x) = floor(x/y), where x >= 0 and y > 0 + (with y fixed). + + Let a = floor(z/y), for some choice of z. Note that z will be + chosen so that division by z is cheap. + + Let r be the remainder(z/y). In other words, r = z - ay. + + Now, our method is to choose a value for b such that + + q'(x) = floor((ax+b)/z) + + is equal to q(x) over as large a range of x as possible. If the + two are equal over a sufficiently large range, and if it is easy to + form the product (ax), and it is easy to divide by z, then we can + perform the division much faster than the general division algorithm. + + So, we want the following to be true: + + . For x in the following range: + . + . ky <= x < (k+1)y + . + . implies that + . + . k <= (ax+b)/z < (k+1) + + We want to determine b such that this is true for all k in the + range {0..K} for some maximum K. + + Since (ax+b) is an increasing function of x, we can take each + bound separately to determine the "best" value for b. 
+ + (ax+b)/z < (k+1) implies + + a((k+1)y-1)+b < (k+1)z implies + + b < a + (k+1)(z-ay) implies + + b < a + (k+1)r + + This needs to be true for all k in the range {0..K}. In + particular, it is true for k = 0 and this leads to a maximum + acceptable value for b. + + b < a+r or b <= a+r-1 + + Taking the other bound, we have + + k <= (ax+b)/z implies + + k <= (aky+b)/z implies + + k(z-ay) <= b implies + + kr <= b + + Clearly, the largest range for k will be achieved by maximizing b, + when r is not zero. When r is zero, then the simplest choice for b + is 0. When r is not 0, set + + . b = a+r-1 + + Now, by construction, q'(x) = floor((ax+b)/z) = q(x) = floor(x/y) + for all x in the range: + + . 0 <= x < (K+1)y + + We need to determine what K is. Of our two bounds, + + . b < a+(k+1)r is satisfied for all k >= 0, by construction. + + The other bound is + + . kr <= b + + This is always true if r = 0. If r is not 0 (the usual case), then + K = floor((a+r-1)/r), is the maximum value for k. + + Therefore, the formula q'(x) = floor((ax+b)/z) yields the correct + answer for q(x) = floor(x/y) when x is in the range + + 0 <= x <= (K+1)y-1, where K = floor((a+r-1)/r) + + To be most useful, we want (K+1)y-1 = (max x) >= 2**32-1 so that + the formula for q'(x) yields the correct value of q(x) for all x + representable by a single word in HPPA. + + We are also constrained in that computing the product (ax), adding + b, and dividing by z must all be done quickly, otherwise we will be + better off going through the general algorithm using the DS + instruction, which uses approximately 70 cycles. + + For each y, there is a choice of z which satisfies the constraints + for (K+1)y >= 2**32. We may not, however, be able to satisfy the + timing constraints for arbitrary y. It seems that z being equal to + a power of 2 or a power of 2 minus 1 is as good as we can do, since + it minimizes the time to do division by z.
We want the choice of z + to also result in a value for (a) that minimizes the computation of + the product (ax). This is best achieved if (a) has a regular bit + pattern (so the multiplication can be done with shifts and adds). + The value of (a) also needs to be less than 2**32 so the product is + always guaranteed to fit in 2 words. + + In actual practice, the following should be done: + + 1) For negative x, you should take the absolute value and remember + . the fact so that the result can be negated. This obviously does + . not apply in the unsigned case. + 2) For even y, you should factor out the power of 2 that divides y + . and divide x by it. You can then proceed by dividing by the + . odd factor of y. + + Here is a table of some odd values of y, and corresponding choices + for z which are "good". + + y z r a (hex) max x (hex) + + 3 2**32 1 55555555 100000001 + 5 2**32 1 33333333 100000003 + 7 2**24-1 0 249249 (infinite) + 9 2**24-1 0 1c71c7 (infinite) + 11 2**20-1 0 1745d (infinite) + 13 2**24-1 0 13b13b (infinite) + 15 2**32 1 11111111 10000000d + 17 2**32 1 f0f0f0f 10000000f + + If r is 1, then b = a+r-1 = a. This simplifies the computation + of (ax+b), since you can compute (x+1)(a) instead. If r is 0, + then b = 0 is ok to use which simplifies (ax+b). + + The bit patterns for 55555555, 33333333, and 11111111 are obviously + very regular. The bit patterns for the other values of a above are: + + y (hex) (binary) + + 7 249249 001001001001001001001001 << regular >> + 9 1c71c7 000111000111000111000111 << regular >> + 11 1745d 000000010111010001011101 << irregular >> + 13 13b13b 000100111011000100111011 << irregular >> + + The bit patterns for (a) corresponding to (y) of 11 and 13 may be + too irregular to warrant using this method. + + When z is a power of 2 minus 1, then the division by z is slightly + more complicated, involving an iterative solution. + + The code presented here solves division by 1 through 17, except for + 11 and 13. 
There are algorithms for both signed and unsigned + quantities given. + + TIMINGS (cycles) + + divisor positive negative unsigned + + . 1 2 2 2 + . 2 4 4 2 + . 3 19 21 19 + . 4 4 4 2 + . 5 18 22 19 + . 6 19 22 19 + . 8 4 4 2 + . 10 18 19 17 + . 12 18 20 18 + . 15 16 18 16 + . 16 4 4 2 + . 17 16 18 16 + + Now, the algorithm for 7, 9, and 14 is an iterative one. That is, + a loop body is executed until the tentative quotient is 0. The + number of times the loop body is executed varies depending on the + dividend, but is never more than two times. If the dividend is + less than the divisor, then the loop body is not executed at all. + Each iteration adds 4 cycles to the timings. + + divisor positive negative unsigned + + . 7 19+4n 20+4n 20+4n n = number of iterations + . 9 21+4n 22+4n 21+4n + . 14 21+4n 22+4n 20+4n + + To give an idea of how the number of iterations varies, here is a + table of dividend versus number of iterations when dividing by 7. + + smallest largest required + dividend dividend iterations + + . 0 6 0 + . 7 0x6ffffff 1 + 0x1000006 0xffffffff 2 + + There is some overlap in the range of numbers requiring 1 and 2 + iterations. */ + +RDEFINE(t2,r1) +RDEFINE(x2,arg0) /* r26 */ +RDEFINE(t1,arg1) /* r25 */ +RDEFINE(x1,ret1) /* r29 */ + + SUBSPA_MILLI_DIV + ATTR_MILLI + + .proc + .callinfo millicode + .entry +/* NONE of these routines require a stack frame + ALL of these routines are unwindable from millicode */ + +GSYM($$divide_by_constant) + .export $$divide_by_constant,millicode +/* Provides a "nice" label for the code covered by the unwind descriptor + for things like gprof. 
*/ + +/* DIVISION BY 2 (shift by 1) */ +GSYM($$divI_2) + .export $$divI_2,millicode + comclr,>= arg0,0,0 + addi 1,arg0,arg0 + MILLIRET + extrs arg0,30,31,ret1 + + +/* DIVISION BY 4 (shift by 2) */ +GSYM($$divI_4) + .export $$divI_4,millicode + comclr,>= arg0,0,0 + addi 3,arg0,arg0 + MILLIRET + extrs arg0,29,30,ret1 + + +/* DIVISION BY 8 (shift by 3) */ +GSYM($$divI_8) + .export $$divI_8,millicode + comclr,>= arg0,0,0 + addi 7,arg0,arg0 + MILLIRET + extrs arg0,28,29,ret1 + +/* DIVISION BY 16 (shift by 4) */ +GSYM($$divI_16) + .export $$divI_16,millicode + comclr,>= arg0,0,0 + addi 15,arg0,arg0 + MILLIRET + extrs arg0,27,28,ret1 + +/**************************************************************************** +* +* DIVISION BY DIVISORS OF FFFFFFFF, and powers of 2 times these +* +* includes 3,5,15,17 and also 6,10,12 +* +****************************************************************************/ + +/* DIVISION BY 3 (use z = 2**32; a = 55555555) */ + +GSYM($$divI_3) + .export $$divI_3,millicode + comb,<,N x2,0,LREF(neg3) + + addi 1,x2,x2 /* this cannot overflow */ + extru x2,1,2,x1 /* multiply by 5 to get started */ + sh2add x2,x2,x2 + b LREF(pos) + addc x1,0,x1 + +LSYM(neg3) + subi 1,x2,x2 /* this cannot overflow */ + extru x2,1,2,x1 /* multiply by 5 to get started */ + sh2add x2,x2,x2 + b LREF(neg) + addc x1,0,x1 + +GSYM($$divU_3) + .export $$divU_3,millicode + addi 1,x2,x2 /* this CAN overflow */ + addc 0,0,x1 + shd x1,x2,30,t1 /* multiply by 5 to get started */ + sh2add x2,x2,x2 + b LREF(pos) + addc x1,t1,x1 + +/* DIVISION BY 5 (use z = 2**32; a = 33333333) */ + +GSYM($$divI_5) + .export $$divI_5,millicode + comb,<,N x2,0,LREF(neg5) + + addi 3,x2,t1 /* this cannot overflow */ + sh1add x2,t1,x2 /* multiply by 3 to get started */ + b LREF(pos) + addc 0,0,x1 + +LSYM(neg5) + sub 0,x2,x2 /* negate x2 */ + addi 1,x2,x2 /* this cannot overflow */ + shd 0,x2,31,x1 /* get top bit (can be 1) */ + sh1add x2,x2,x2 /* multiply by 3 to get started */ + b LREF(neg) + addc 
x1,0,x1
+
+GSYM($$divU_5)
+	.export $$divU_5,millicode
+	addi 1,x2,x2 /* this CAN overflow */
+	addc 0,0,x1	/* x1 = carry out of the increment */
+	shd x1,x2,31,t1 /* multiply by 3 to get started */
+	sh1add x2,x2,x2
+	b LREF(pos)
+	addc t1,x1,x1
+
+/* DIVISION BY 6 (shift to divide by 2 then divide by 3) */
+GSYM($$divI_6)
+	.export $$divI_6,millicode
+	comb,<,N x2,0,LREF(neg6)	/* negative dividend: handled at neg6 */
+	extru x2,30,31,x2 /* divide by 2 */
+	addi 5,x2,t1 /* compute 5*(x2+1) = 5*x2+5 */
+	sh2add x2,t1,x2 /* multiply by 5 to get started */
+	b LREF(pos)
+	addc 0,0,x1
+
+LSYM(neg6)
+	subi 2,x2,x2 /* negate, divide by 2, and add 1 */
+	/* negation and adding 1 are done */
+	/* at the same time by the SUBI */
+	extru x2,30,31,x2
+	shd 0,x2,30,x1
+	sh2add x2,x2,x2 /* multiply by 5 to get started */
+	b LREF(neg)
+	addc x1,0,x1
+
+GSYM($$divU_6)
+	.export $$divU_6,millicode
+	extru x2,30,31,x2 /* divide by 2 */
+	addi 1,x2,x2 /* cannot carry */
+	shd 0,x2,30,x1 /* multiply by 5 to get started */
+	sh2add x2,x2,x2
+	b LREF(pos)
+	addc x1,0,x1
+
+/* DIVISION BY 10 (shift to divide by 2 then divide by 5) */
+GSYM($$divU_10)
+	.export $$divU_10,millicode
+	extru x2,30,31,x2 /* divide by 2 */
+	addi 3,x2,t1 /* compute 3*(x2+1) = (3*x2)+3 */
+	sh1add x2,t1,x2 /* multiply by 3 to get started */
+	addc 0,0,x1	/* falls through into pos */
+LSYM(pos)	/* shared tail, non-negated dividend: <x1,x2> *= 0x11 * 0x101 * 0x10001 (= 0x11111111); the quotient ends up in x1 */
+	shd x1,x2,28,t1 /* multiply by 0x11 */
+	shd x2,0,28,t2
+	add x2,t2,x2
+	addc x1,t1,x1
+LSYM(pos_for_17)	/* entry used by the divide-by-17 routines: skips the 0x11 step */
+	shd x1,x2,24,t1 /* multiply by 0x101 */
+	shd x2,0,24,t2
+	add x2,t2,x2
+	addc x1,t1,x1
+
+	shd x1,x2,16,t1 /* multiply by 0x10001 */
+	shd x2,0,16,t2
+	add x2,t2,x2
+	MILLIRET
+	addc x1,t1,x1	/* last high-word update, placed after the return */
+
+GSYM($$divI_10)
+	.export $$divI_10,millicode
+	comb,< x2,0,LREF(neg10)
+	copy 0,x1	/* x1 = 0; not nullified, so it applies to both paths */
+	extru x2,30,31,x2 /* divide by 2 */
+	addib,TR 1,x2,LREF(pos) /* add 1 (cannot overflow) */
+	sh1add x2,x2,x2 /* multiply by 3 to get started */
+
+LSYM(neg10)
+	subi 2,x2,x2 /* negate, divide by 2, and add 1 */
+	/* negation and adding 1 are done */
+	/* at the same time by the SUBI */
+	extru x2,30,31,x2
+	sh1add x2,x2,x2 /* multiply by 3 to get started */
+LSYM(neg)	/* shared tail, negated dividend: same multiply chain as pos, then the quotient is negated */
+	shd x1,x2,28,t1 /* multiply by 0x11 */
+	shd x2,0,28,t2
+	add x2,t2,x2
+	addc x1,t1,x1
+LSYM(neg_for_17)	/* entry used by neg17: skips the 0x11 step */
+	shd x1,x2,24,t1 /* multiply by 0x101 */
+	shd x2,0,24,t2
+	add x2,t2,x2
+	addc x1,t1,x1
+
+	shd x1,x2,16,t1 /* multiply by 0x10001 */
+	shd x2,0,16,t2
+	add x2,t2,x2
+	addc x1,t1,x1
+	MILLIRET
+	sub 0,x1,x1	/* negate the quotient */
+
+/* DIVISION BY 12 (shift to divide by 4 then divide by 3) */
+GSYM($$divI_12)
+	.export $$divI_12,millicode
+	comb,< x2,0,LREF(neg12)
+	copy 0,x1	/* x1 = 0; not nullified, so it applies to both paths */
+	extru x2,29,30,x2 /* divide by 4 */
+	addib,tr 1,x2,LREF(pos) /* compute 5*(x2+1) = 5*x2+5 */
+	sh2add x2,x2,x2 /* multiply by 5 to get started */
+
+LSYM(neg12)
+	subi 4,x2,x2 /* negate, divide by 4, and add 1 */
+	/* negation and adding 1 are done */
+	/* at the same time by the SUBI */
+	extru x2,29,30,x2
+	b LREF(neg)
+	sh2add x2,x2,x2 /* multiply by 5 to get started */
+
+GSYM($$divU_12)
+	.export $$divU_12,millicode
+	extru x2,29,30,x2 /* divide by 4 */
+	addi 5,x2,t1 /* cannot carry */
+	sh2add x2,t1,x2 /* multiply by 5 to get started */
+	b LREF(pos)
+	addc 0,0,x1
+
+/* DIVISION BY 15 (use z = 2**32; a = 11111111) */
+GSYM($$divI_15)
+	.export $$divI_15,millicode
+	comb,< x2,0,LREF(neg15)
+	copy 0,x1
+	addib,tr 1,x2,LREF(pos)+4	/* add 1 and enter pos one instruction late */
+	shd x1,x2,28,t1	/* copy of the first instruction at pos, which the +4 target skips */
+
+LSYM(neg15)
+	b LREF(neg)
+	subi 1,x2,x2	/* x2 = 1 - x2: negate and add 1 */
+
+GSYM($$divU_15)
+	.export $$divU_15,millicode
+	addi 1,x2,x2 /* this CAN overflow */
+	b LREF(pos)
+	addc 0,0,x1	/* x1 = carry out of the increment */
+
+/* DIVISION BY 17 (use z = 2**32; a = f0f0f0f) */
+GSYM($$divI_17)
+	.export $$divI_17,millicode
+	comb,<,n x2,0,LREF(neg17)
+	addi 1,x2,x2 /* this cannot overflow */
+	shd 0,x2,28,t1 /* multiply by 0xf to get started */
+	shd x2,0,28,t2
+	sub t2,x2,x2	/* low word of 16*x2 - x2 */
+	b LREF(pos_for_17)
+	subb t1,0,x1	/* high word, with the borrow */
+
+LSYM(neg17)
+	subi 1,x2,x2 /* this cannot overflow */
+	shd 0,x2,28,t1 /* multiply by 0xf to get started */
+	shd x2,0,28,t2
+	sub t2,x2,x2	/* low word of 16*x2 - x2 */
+	b LREF(neg_for_17)
+	subb t1,0,x1	/* high word, with the borrow */
+
+GSYM($$divU_17)
+	.export $$divU_17,millicode
+	addi 1,x2,x2 /* this CAN overflow */
+	addc 0,0,x1	/* x1 = carry out of the increment */
+	shd
x1,x2,28,t1 /* multiply by 0xf to get started */
+LSYM(u17)
+	shd x2,0,28,t2
+	sub t2,x2,x2
+	b LREF(pos_for_17)
+	subb t1,x1,x1
+
+
+/* DIVISION BY DIVISORS OF FFFFFF, and powers of 2 times these
+	includes 7,9 and also 14
+
+
+	z = 2**24-1
+	r = z mod x = 0
+
+	so choose b = 0
+
+	Also, in order to divide by z = 2**24-1, we approximate by dividing
+	by (z+1) = 2**24 (which is easy), and then correcting.
+
+	(ax) = (z+1)q' + r
+	. = zq' + (q'+r)
+
+	So to compute (ax)/z, compute q' = (ax)/(z+1) and r = (ax) mod (z+1)
+	Then the true remainder of (ax)/z is (q'+r). Repeat the process
+	with this new remainder, adding the tentative quotients together,
+	until a tentative quotient is 0 (and then we are done). There is
+	one last correction to be done. It is possible that (q'+r) = z.
+	If so, then (q'+r)/(z+1) = 0 and it looks like we are done. But,
+	in fact, we need to add 1 more to the quotient. Now, it turns
+	out that this happens if and only if the original value x is
+	an exact multiple of y. So, to avoid a three instruction test at
+	the end, instead use 1 instruction to add 1 to x at the beginning. */
+
+/* DIVISION BY 7 (use z = 2**24-1; a = 249249) */
+GSYM($$divI_7)
+	.export $$divI_7,millicode
+	comb,<,n x2,0,LREF(neg7)
+LSYM(7)	/* also the target of $$divU_14 and the non-negative $$divI_14 path */
+	addi 1,x2,x2 /* cannot overflow */
+	shd 0,x2,29,x1	/* multiply by 9 to get started: x1 = bits shifted out by the <<3 */
+	sh3add x2,x2,x2	/* x2 = 8*x2 + x2 (low word) */
+	addc x1,0,x1
+LSYM(pos7)	/* shared by 7 and 9: 9 * 0x41 * 0x1001 = 0x249249, 7 * 0x41 * 0x1001 = 0x1c71c7 */
+	shd x1,x2,26,t1	/* multiply by 0x41 */
+	shd x2,0,26,t2
+	add x2,t2,x2
+	addc x1,t1,x1
+
+	shd x1,x2,20,t1	/* multiply by 0x1001 */
+	shd x2,0,20,t2
+	add x2,t2,x2
+	addc x1,t1,t1	/* high word of the product is left in t1 */
+
+	/* computed <t1,x2>. Now divide it by (2**24 - 1) */
+
+	copy 0,x1	/* x1 accumulates the quotient */
+	shd,= t1,x2,24,t1 /* tentative quotient */
+LSYM(1)
+	addb,tr t1,x1,LREF(2) /* add to previous quotient */
+	extru x2,31,24,x2 /* new remainder (unadjusted) */
+
+	MILLIRETN
+
+LSYM(2)
+	addb,tr t1,x2,LREF(1) /* adjust remainder */
+	extru,= x2,7,8,t1 /* new quotient */
+
+LSYM(neg7)
+	subi 1,x2,x2 /* negate x2 and add 1 */
+LSYM(8)	/* also the target of neg14 */
+	shd 0,x2,29,x1	/* multiply by 9 to get started */
+	sh3add x2,x2,x2
+	addc x1,0,x1
+
+LSYM(neg7_shift)	/* negated-dividend twin of pos7; also the target of neg9 */
+	shd x1,x2,26,t1	/* multiply by 0x41 */
+	shd x2,0,26,t2
+	add x2,t2,x2
+	addc x1,t1,x1
+
+	shd x1,x2,20,t1	/* multiply by 0x1001 */
+	shd x2,0,20,t2
+	add x2,t2,x2
+	addc x1,t1,t1	/* high word of the product is left in t1 */
+
+	/* computed <t1,x2>. Now divide it by (2**24 - 1) */
+
+	copy 0,x1	/* x1 accumulates the quotient */
+	shd,= t1,x2,24,t1 /* tentative quotient */
+LSYM(3)
+	addb,tr t1,x1,LREF(4) /* add to previous quotient */
+	extru x2,31,24,x2 /* new remainder (unadjusted) */
+
+	MILLIRET
+	sub 0,x1,x1 /* negate result */
+
+LSYM(4)
+	addb,tr t1,x2,LREF(3) /* adjust remainder */
+	extru,= x2,7,8,t1 /* new quotient */
+
+GSYM($$divU_7)
+	.export $$divU_7,millicode
+	addi 1,x2,x2 /* can carry */
+	addc 0,0,x1	/* x1 = carry out of the increment */
+	shd x1,x2,29,t1	/* multiply by 9 to get started */
+	sh3add x2,x2,x2
+	b LREF(pos7)
+	addc t1,x1,x1
+
+/* DIVISION BY 9 (use z = 2**24-1; a = 1c71c7) */
+GSYM($$divI_9)
+	.export $$divI_9,millicode
+	comb,<,n x2,0,LREF(neg9)
+	addi 1,x2,x2 /* cannot overflow */
+	shd 0,x2,29,t1	/* multiply by 7 to get started (8*x2 - x2) */
+	shd x2,0,29,t2
+	sub t2,x2,x2
+	b LREF(pos7)
+	subb t1,0,x1
+
+LSYM(neg9)
+	subi 1,x2,x2 /* negate and add 1 */
+	shd 0,x2,29,t1	/* multiply by 7 to get started (8*x2 - x2) */
+	shd x2,0,29,t2
+	sub t2,x2,x2
+	b LREF(neg7_shift)
+	subb t1,0,x1
+
+GSYM($$divU_9)
+	.export $$divU_9,millicode
+	addi 1,x2,x2 /* can carry */
+	addc 0,0,x1	/* x1 = carry out of the increment */
+	shd x1,x2,29,t1	/* multiply by 7 to get started (8*x2 - x2) */
+	shd x2,0,29,t2
+	sub t2,x2,x2
+	b LREF(pos7)
+	subb t1,x1,x1
+
+/* DIVISION BY 14 (shift to divide by 2 then divide by 7) */
+GSYM($$divI_14)
+	.export $$divI_14,millicode
+	comb,<,n x2,0,LREF(neg14)	/* non-negative dividends fall into the unsigned entry */
+GSYM($$divU_14)
+	.export $$divU_14,millicode
+	b LREF(7) /* go to 7 case */
+	extru x2,30,31,x2 /* divide by 2 */
+
+LSYM(neg14)
+	subi 2,x2,x2 /* negate (and add 2) */
+	b LREF(8)
+
extru x2,30,31,x2 /* divide by 2 */
+	.exit
+	.procend
+	.end
+#endif
+
+#ifdef L_mulI
+/* VERSION "@(#)$$mulI $ Revision: 12.4 $ $ Date: 94/03/17 17:18:51 $" */
+/******************************************************************************
+This routine is used on PA2.0 processors when gcc -mno-fpregs is used
+
+ROUTINE: $$mulI
+
+
+DESCRIPTION:
+
+	$$mulI multiplies two single word integers, giving a single
+	word result.
+
+
+INPUT REGISTERS:
+
+	arg0 = Operand 1
+	arg1 = Operand 2
+	r31 == return pc
+	sr0 == return space when called externally
+
+
+OUTPUT REGISTERS:
+
+	arg0 = undefined
+	arg1 = undefined
+	ret1 = result
+
+OTHER REGISTERS AFFECTED:
+
+	r1 = undefined
+
+SIDE EFFECTS:
+
+	Causes a trap under the following conditions: NONE
+	Changes memory at the following places: NONE
+
+PERMISSIBLE CONTEXT:
+
+	Unwindable
+	Does not create a stack frame
+	Is usable for internal or external millicode
+
+DISCUSSION:
+
+	Calls other millicode routines via mrp: NONE
+	Calls other millicode routines: NONE
+
+	a1 is consumed eight bits at a time.  Each pass dispatches
+	through blr into a 256-entry table (x0..x255); entry xN adds
+	N times the current a0 to the result r, and a0 is shifted
+	left by 8 between passes.  The loop ends when a1 becomes 0.
+
+	Macro naming: X__Y performs X = Y, where "_" between terms of Y
+	means add and "m" means subtract (t0__3a0 is t0 = 3*a0,
+	t0__t0ma0 is t0 = t0 - a0, r__r_2t0 is r = r + 2*t0).
+	a1_ne_0_b_lN branches to label lN while a1 is nonzero;
+	b_e_* branches to one of the shared exit sequences that
+	follow the table.
+
+***************************************************************************/
+
+
+#define a0	%arg0
+#define a1	%arg1
+#define t0	%r1
+#define r	%ret1
+
+#define a0__128a0	zdep a0,24,25,a0
+#define a0__256a0	zdep a0,23,24,a0
+#define a1_ne_0_b_l0	comb,<> a1,0,LREF(l0)
+#define a1_ne_0_b_l1	comb,<> a1,0,LREF(l1)
+#define a1_ne_0_b_l2	comb,<> a1,0,LREF(l2)
+#define b_n_ret_t0	b,n LREF(ret_t0)
+#define b_e_shift	b LREF(e_shift)
+#define b_e_t0ma0	b LREF(e_t0ma0)
+#define b_e_t0	b LREF(e_t0)
+#define b_e_t0a0	b LREF(e_t0a0)
+#define b_e_t02a0	b LREF(e_t02a0)
+#define b_e_t04a0	b LREF(e_t04a0)
+#define b_e_2t0	b LREF(e_2t0)
+#define b_e_2t0a0	b LREF(e_2t0a0)
+#define b_e_2t04a0	b LREF(e2t04a0)
+#define b_e_3t0	b LREF(e_3t0)
+#define b_e_4t0	b LREF(e_4t0)
+#define b_e_4t0a0	b LREF(e_4t0a0)
+#define b_e_4t08a0	b LREF(e4t08a0)
+#define b_e_5t0	b LREF(e_5t0)
+#define b_e_8t0	b LREF(e_8t0)
+#define b_e_8t0a0	b LREF(e_8t0a0)
+#define r__r_a0	add r,a0,r
+#define r__r_2a0	sh1add a0,r,r
+#define r__r_4a0	sh2add a0,r,r
+#define r__r_8a0	sh3add a0,r,r
+#define r__r_t0	add r,t0,r
+#define r__r_2t0	sh1add t0,r,r
+#define r__r_4t0	sh2add t0,r,r
+#define r__r_8t0	sh3add t0,r,r
+#define t0__3a0	sh1add a0,a0,t0
+#define t0__4a0	sh2add a0,0,t0
+#define t0__5a0	sh2add a0,a0,t0
+#define t0__8a0	sh3add a0,0,t0
+#define t0__9a0	sh3add a0,a0,t0
+#define t0__16a0	zdep a0,27,28,t0
+#define t0__32a0	zdep a0,26,27,t0
+#define t0__64a0	zdep a0,25,26,t0
+#define t0__128a0	zdep a0,24,25,t0
+#define t0__t0ma0	sub t0,a0,t0
+#define t0__t0_a0	add t0,a0,t0
+#define t0__t0_2a0	sh1add a0,t0,t0
+#define t0__t0_4a0	sh2add a0,t0,t0
+#define t0__t0_8a0	sh3add a0,t0,t0
+#define t0__2t0_a0	sh1add t0,a0,t0
+#define t0__3t0	sh1add t0,t0,t0
+#define t0__4t0	sh2add t0,0,t0
+#define t0__4t0_a0	sh2add t0,a0,t0
+#define t0__5t0	sh2add t0,t0,t0
+#define t0__8t0	sh3add t0,0,t0
+#define t0__8t0_a0	sh3add t0,a0,t0
+#define t0__9t0	sh3add t0,t0,t0
+#define t0__16t0	zdep t0,27,28,t0
+#define t0__32t0	zdep t0,26,27,t0
+#define t0__256a0	zdep a0,23,24,t0
+
+
+	SUBSPA_MILLI
+	ATTR_MILLI
+	.align 16
+	.proc
+	.callinfo millicode
+	.export $$mulI,millicode
+GSYM($$mulI)
+	combt,<<= a1,a0,LREF(l4) /* swap args if unsigned a1>a0 */
+	copy 0,r /* zero out the result */
+	xor a0,a1,a0 /* swap a0 & a1 using the */
+	xor a0,a1,a1 /* old xor trick */
+	xor a0,a1,a0
+LSYM(l4)
+	combt,<= 0,a0,LREF(l3) /* if a0>=0 then proceed like unsigned */
+	zdep a1,30,8,t0 /* t0 = (a1&0xff)<<1 ********* */
+	sub,> 0,a1,t0 /* otherwise negate both and */
+	combt,<=,n a0,t0,LREF(l2) /* swap back if |a0|<|a1| */
+	sub 0,a0,a1
+	movb,tr,n t0,a0,LREF(l2) /* 10th inst. */
+/* Dispatch loop.  blr scales its index register by 8 bytes, so with t0 = 2*(a1 & 0xff) each table entry is 16 bytes, i.e. 4 instructions. */
+LSYM(l0)	r__r_t0 /* add in this partial product */
+LSYM(l1)	a0__256a0 /* a0 <<= 8 ****************** */
+LSYM(l2)	zdep a1,30,8,t0 /* t0 = (a1&0xff)<<1 ********* */
+LSYM(l3)	blr t0,0 /* case on these 8 bits ****** */
+	extru a1,23,24,a1 /* a1 >>= 8 ****************** */
+
+/*16 insts before this.
*/ +/* a0 <<= 8 ************************** */ +LSYM(x0) a1_ne_0_b_l2 ! a0__256a0 ! MILLIRETN ! nop +LSYM(x1) a1_ne_0_b_l1 ! r__r_a0 ! MILLIRETN ! nop +LSYM(x2) a1_ne_0_b_l1 ! r__r_2a0 ! MILLIRETN ! nop +LSYM(x3) a1_ne_0_b_l0 ! t0__3a0 ! MILLIRET ! r__r_t0 +LSYM(x4) a1_ne_0_b_l1 ! r__r_4a0 ! MILLIRETN ! nop +LSYM(x5) a1_ne_0_b_l0 ! t0__5a0 ! MILLIRET ! r__r_t0 +LSYM(x6) t0__3a0 ! a1_ne_0_b_l1 ! r__r_2t0 ! MILLIRETN +LSYM(x7) t0__3a0 ! a1_ne_0_b_l0 ! r__r_4a0 ! b_n_ret_t0 +LSYM(x8) a1_ne_0_b_l1 ! r__r_8a0 ! MILLIRETN ! nop +LSYM(x9) a1_ne_0_b_l0 ! t0__9a0 ! MILLIRET ! r__r_t0 +LSYM(x10) t0__5a0 ! a1_ne_0_b_l1 ! r__r_2t0 ! MILLIRETN +LSYM(x11) t0__3a0 ! a1_ne_0_b_l0 ! r__r_8a0 ! b_n_ret_t0 +LSYM(x12) t0__3a0 ! a1_ne_0_b_l1 ! r__r_4t0 ! MILLIRETN +LSYM(x13) t0__5a0 ! a1_ne_0_b_l0 ! r__r_8a0 ! b_n_ret_t0 +LSYM(x14) t0__3a0 ! t0__2t0_a0 ! b_e_shift ! r__r_2t0 +LSYM(x15) t0__5a0 ! a1_ne_0_b_l0 ! t0__3t0 ! b_n_ret_t0 +LSYM(x16) t0__16a0 ! a1_ne_0_b_l1 ! r__r_t0 ! MILLIRETN +LSYM(x17) t0__9a0 ! a1_ne_0_b_l0 ! t0__t0_8a0 ! b_n_ret_t0 +LSYM(x18) t0__9a0 ! a1_ne_0_b_l1 ! r__r_2t0 ! MILLIRETN +LSYM(x19) t0__9a0 ! a1_ne_0_b_l0 ! t0__2t0_a0 ! b_n_ret_t0 +LSYM(x20) t0__5a0 ! a1_ne_0_b_l1 ! r__r_4t0 ! MILLIRETN +LSYM(x21) t0__5a0 ! a1_ne_0_b_l0 ! t0__4t0_a0 ! b_n_ret_t0 +LSYM(x22) t0__5a0 ! t0__2t0_a0 ! b_e_shift ! r__r_2t0 +LSYM(x23) t0__5a0 ! t0__2t0_a0 ! b_e_t0 ! t0__2t0_a0 +LSYM(x24) t0__3a0 ! a1_ne_0_b_l1 ! r__r_8t0 ! MILLIRETN +LSYM(x25) t0__5a0 ! a1_ne_0_b_l0 ! t0__5t0 ! b_n_ret_t0 +LSYM(x26) t0__3a0 ! t0__4t0_a0 ! b_e_shift ! r__r_2t0 +LSYM(x27) t0__3a0 ! a1_ne_0_b_l0 ! t0__9t0 ! b_n_ret_t0 +LSYM(x28) t0__3a0 ! t0__2t0_a0 ! b_e_shift ! r__r_4t0 +LSYM(x29) t0__3a0 ! t0__2t0_a0 ! b_e_t0 ! t0__4t0_a0 +LSYM(x30) t0__5a0 ! t0__3t0 ! b_e_shift ! r__r_2t0 +LSYM(x31) t0__32a0 ! a1_ne_0_b_l0 ! t0__t0ma0 ! b_n_ret_t0 +LSYM(x32) t0__32a0 ! a1_ne_0_b_l1 ! r__r_t0 ! MILLIRETN +LSYM(x33) t0__8a0 ! a1_ne_0_b_l0 ! t0__4t0_a0 ! b_n_ret_t0 +LSYM(x34) t0__16a0 ! t0__t0_a0 ! b_e_shift ! 
r__r_2t0 +LSYM(x35) t0__9a0 ! t0__3t0 ! b_e_t0 ! t0__t0_8a0 +LSYM(x36) t0__9a0 ! a1_ne_0_b_l1 ! r__r_4t0 ! MILLIRETN +LSYM(x37) t0__9a0 ! a1_ne_0_b_l0 ! t0__4t0_a0 ! b_n_ret_t0 +LSYM(x38) t0__9a0 ! t0__2t0_a0 ! b_e_shift ! r__r_2t0 +LSYM(x39) t0__9a0 ! t0__2t0_a0 ! b_e_t0 ! t0__2t0_a0 +LSYM(x40) t0__5a0 ! a1_ne_0_b_l1 ! r__r_8t0 ! MILLIRETN +LSYM(x41) t0__5a0 ! a1_ne_0_b_l0 ! t0__8t0_a0 ! b_n_ret_t0 +LSYM(x42) t0__5a0 ! t0__4t0_a0 ! b_e_shift ! r__r_2t0 +LSYM(x43) t0__5a0 ! t0__4t0_a0 ! b_e_t0 ! t0__2t0_a0 +LSYM(x44) t0__5a0 ! t0__2t0_a0 ! b_e_shift ! r__r_4t0 +LSYM(x45) t0__9a0 ! a1_ne_0_b_l0 ! t0__5t0 ! b_n_ret_t0 +LSYM(x46) t0__9a0 ! t0__5t0 ! b_e_t0 ! t0__t0_a0 +LSYM(x47) t0__9a0 ! t0__5t0 ! b_e_t0 ! t0__t0_2a0 +LSYM(x48) t0__3a0 ! a1_ne_0_b_l0 ! t0__16t0 ! b_n_ret_t0 +LSYM(x49) t0__9a0 ! t0__5t0 ! b_e_t0 ! t0__t0_4a0 +LSYM(x50) t0__5a0 ! t0__5t0 ! b_e_shift ! r__r_2t0 +LSYM(x51) t0__9a0 ! t0__t0_8a0 ! b_e_t0 ! t0__3t0 +LSYM(x52) t0__3a0 ! t0__4t0_a0 ! b_e_shift ! r__r_4t0 +LSYM(x53) t0__3a0 ! t0__4t0_a0 ! b_e_t0 ! t0__4t0_a0 +LSYM(x54) t0__9a0 ! t0__3t0 ! b_e_shift ! r__r_2t0 +LSYM(x55) t0__9a0 ! t0__3t0 ! b_e_t0 ! t0__2t0_a0 +LSYM(x56) t0__3a0 ! t0__2t0_a0 ! b_e_shift ! r__r_8t0 +LSYM(x57) t0__9a0 ! t0__2t0_a0 ! b_e_t0 ! t0__3t0 +LSYM(x58) t0__3a0 ! t0__2t0_a0 ! b_e_2t0 ! t0__4t0_a0 +LSYM(x59) t0__9a0 ! t0__2t0_a0 ! b_e_t02a0 ! t0__3t0 +LSYM(x60) t0__5a0 ! t0__3t0 ! b_e_shift ! r__r_4t0 +LSYM(x61) t0__5a0 ! t0__3t0 ! b_e_t0 ! t0__4t0_a0 +LSYM(x62) t0__32a0 ! t0__t0ma0 ! b_e_shift ! r__r_2t0 +LSYM(x63) t0__64a0 ! a1_ne_0_b_l0 ! t0__t0ma0 ! b_n_ret_t0 +LSYM(x64) t0__64a0 ! a1_ne_0_b_l1 ! r__r_t0 ! MILLIRETN +LSYM(x65) t0__8a0 ! a1_ne_0_b_l0 ! t0__8t0_a0 ! b_n_ret_t0 +LSYM(x66) t0__32a0 ! t0__t0_a0 ! b_e_shift ! r__r_2t0 +LSYM(x67) t0__8a0 ! t0__4t0_a0 ! b_e_t0 ! t0__2t0_a0 +LSYM(x68) t0__8a0 ! t0__2t0_a0 ! b_e_shift ! r__r_4t0 +LSYM(x69) t0__8a0 ! t0__2t0_a0 ! b_e_t0 ! t0__4t0_a0 +LSYM(x70) t0__64a0 ! t0__t0_4a0 ! b_e_t0 ! t0__t0_2a0 +LSYM(x71) t0__9a0 ! 
t0__8t0 ! b_e_t0 ! t0__t0ma0 +LSYM(x72) t0__9a0 ! a1_ne_0_b_l1 ! r__r_8t0 ! MILLIRETN +LSYM(x73) t0__9a0 ! t0__8t0_a0 ! b_e_shift ! r__r_t0 +LSYM(x74) t0__9a0 ! t0__4t0_a0 ! b_e_shift ! r__r_2t0 +LSYM(x75) t0__9a0 ! t0__4t0_a0 ! b_e_t0 ! t0__2t0_a0 +LSYM(x76) t0__9a0 ! t0__2t0_a0 ! b_e_shift ! r__r_4t0 +LSYM(x77) t0__9a0 ! t0__2t0_a0 ! b_e_t0 ! t0__4t0_a0 +LSYM(x78) t0__9a0 ! t0__2t0_a0 ! b_e_2t0 ! t0__2t0_a0 +LSYM(x79) t0__16a0 ! t0__5t0 ! b_e_t0 ! t0__t0ma0 +LSYM(x80) t0__16a0 ! t0__5t0 ! b_e_shift ! r__r_t0 +LSYM(x81) t0__9a0 ! t0__9t0 ! b_e_shift ! r__r_t0 +LSYM(x82) t0__5a0 ! t0__8t0_a0 ! b_e_shift ! r__r_2t0 +LSYM(x83) t0__5a0 ! t0__8t0_a0 ! b_e_t0 ! t0__2t0_a0 +LSYM(x84) t0__5a0 ! t0__4t0_a0 ! b_e_shift ! r__r_4t0 +LSYM(x85) t0__8a0 ! t0__2t0_a0 ! b_e_t0 ! t0__5t0 +LSYM(x86) t0__5a0 ! t0__4t0_a0 ! b_e_2t0 ! t0__2t0_a0 +LSYM(x87) t0__9a0 ! t0__9t0 ! b_e_t02a0 ! t0__t0_4a0 +LSYM(x88) t0__5a0 ! t0__2t0_a0 ! b_e_shift ! r__r_8t0 +LSYM(x89) t0__5a0 ! t0__2t0_a0 ! b_e_t0 ! t0__8t0_a0 +LSYM(x90) t0__9a0 ! t0__5t0 ! b_e_shift ! r__r_2t0 +LSYM(x91) t0__9a0 ! t0__5t0 ! b_e_t0 ! t0__2t0_a0 +LSYM(x92) t0__5a0 ! t0__2t0_a0 ! b_e_4t0 ! t0__2t0_a0 +LSYM(x93) t0__32a0 ! t0__t0ma0 ! b_e_t0 ! t0__3t0 +LSYM(x94) t0__9a0 ! t0__5t0 ! b_e_2t0 ! t0__t0_2a0 +LSYM(x95) t0__9a0 ! t0__2t0_a0 ! b_e_t0 ! t0__5t0 +LSYM(x96) t0__8a0 ! t0__3t0 ! b_e_shift ! r__r_4t0 +LSYM(x97) t0__8a0 ! t0__3t0 ! b_e_t0 ! t0__4t0_a0 +LSYM(x98) t0__32a0 ! t0__3t0 ! b_e_t0 ! t0__t0_2a0 +LSYM(x99) t0__8a0 ! t0__4t0_a0 ! b_e_t0 ! t0__3t0 +LSYM(x100) t0__5a0 ! t0__5t0 ! b_e_shift ! r__r_4t0 +LSYM(x101) t0__5a0 ! t0__5t0 ! b_e_t0 ! t0__4t0_a0 +LSYM(x102) t0__32a0 ! t0__t0_2a0 ! b_e_t0 ! t0__3t0 +LSYM(x103) t0__5a0 ! t0__5t0 ! b_e_t02a0 ! t0__4t0_a0 +LSYM(x104) t0__3a0 ! t0__4t0_a0 ! b_e_shift ! r__r_8t0 +LSYM(x105) t0__5a0 ! t0__4t0_a0 ! b_e_t0 ! t0__5t0 +LSYM(x106) t0__3a0 ! t0__4t0_a0 ! b_e_2t0 ! t0__4t0_a0 +LSYM(x107) t0__9a0 ! t0__t0_4a0 ! b_e_t02a0 ! t0__8t0_a0 +LSYM(x108) t0__9a0 ! t0__3t0 ! b_e_shift ! 
r__r_4t0 +LSYM(x109) t0__9a0 ! t0__3t0 ! b_e_t0 ! t0__4t0_a0 +LSYM(x110) t0__9a0 ! t0__3t0 ! b_e_2t0 ! t0__2t0_a0 +LSYM(x111) t0__9a0 ! t0__4t0_a0 ! b_e_t0 ! t0__3t0 +LSYM(x112) t0__3a0 ! t0__2t0_a0 ! b_e_t0 ! t0__16t0 +LSYM(x113) t0__9a0 ! t0__4t0_a0 ! b_e_t02a0 ! t0__3t0 +LSYM(x114) t0__9a0 ! t0__2t0_a0 ! b_e_2t0 ! t0__3t0 +LSYM(x115) t0__9a0 ! t0__2t0_a0 ! b_e_2t0a0 ! t0__3t0 +LSYM(x116) t0__3a0 ! t0__2t0_a0 ! b_e_4t0 ! t0__4t0_a0 +LSYM(x117) t0__3a0 ! t0__4t0_a0 ! b_e_t0 ! t0__9t0 +LSYM(x118) t0__3a0 ! t0__4t0_a0 ! b_e_t0a0 ! t0__9t0 +LSYM(x119) t0__3a0 ! t0__4t0_a0 ! b_e_t02a0 ! t0__9t0 +LSYM(x120) t0__5a0 ! t0__3t0 ! b_e_shift ! r__r_8t0 +LSYM(x121) t0__5a0 ! t0__3t0 ! b_e_t0 ! t0__8t0_a0 +LSYM(x122) t0__5a0 ! t0__3t0 ! b_e_2t0 ! t0__4t0_a0 +LSYM(x123) t0__5a0 ! t0__8t0_a0 ! b_e_t0 ! t0__3t0 +LSYM(x124) t0__32a0 ! t0__t0ma0 ! b_e_shift ! r__r_4t0 +LSYM(x125) t0__5a0 ! t0__5t0 ! b_e_t0 ! t0__5t0 +LSYM(x126) t0__64a0 ! t0__t0ma0 ! b_e_shift ! r__r_2t0 +LSYM(x127) t0__128a0 ! a1_ne_0_b_l0 ! t0__t0ma0 ! b_n_ret_t0 +LSYM(x128) t0__128a0 ! a1_ne_0_b_l1 ! r__r_t0 ! MILLIRETN +LSYM(x129) t0__128a0 ! a1_ne_0_b_l0 ! t0__t0_a0 ! b_n_ret_t0 +LSYM(x130) t0__64a0 ! t0__t0_a0 ! b_e_shift ! r__r_2t0 +LSYM(x131) t0__8a0 ! t0__8t0_a0 ! b_e_t0 ! t0__2t0_a0 +LSYM(x132) t0__8a0 ! t0__4t0_a0 ! b_e_shift ! r__r_4t0 +LSYM(x133) t0__8a0 ! t0__4t0_a0 ! b_e_t0 ! t0__4t0_a0 +LSYM(x134) t0__8a0 ! t0__4t0_a0 ! b_e_2t0 ! t0__2t0_a0 +LSYM(x135) t0__9a0 ! t0__5t0 ! b_e_t0 ! t0__3t0 +LSYM(x136) t0__8a0 ! t0__2t0_a0 ! b_e_shift ! r__r_8t0 +LSYM(x137) t0__8a0 ! t0__2t0_a0 ! b_e_t0 ! t0__8t0_a0 +LSYM(x138) t0__8a0 ! t0__2t0_a0 ! b_e_2t0 ! t0__4t0_a0 +LSYM(x139) t0__8a0 ! t0__2t0_a0 ! b_e_2t0a0 ! t0__4t0_a0 +LSYM(x140) t0__3a0 ! t0__2t0_a0 ! b_e_4t0 ! t0__5t0 +LSYM(x141) t0__8a0 ! t0__2t0_a0 ! b_e_4t0a0 ! t0__2t0_a0 +LSYM(x142) t0__9a0 ! t0__8t0 ! b_e_2t0 ! t0__t0ma0 +LSYM(x143) t0__16a0 ! t0__9t0 ! b_e_t0 ! t0__t0ma0 +LSYM(x144) t0__9a0 ! t0__8t0 ! b_e_shift ! r__r_2t0 +LSYM(x145) t0__9a0 ! 
t0__8t0 ! b_e_t0 ! t0__2t0_a0 +LSYM(x146) t0__9a0 ! t0__8t0_a0 ! b_e_shift ! r__r_2t0 +LSYM(x147) t0__9a0 ! t0__8t0_a0 ! b_e_t0 ! t0__2t0_a0 +LSYM(x148) t0__9a0 ! t0__4t0_a0 ! b_e_shift ! r__r_4t0 +LSYM(x149) t0__9a0 ! t0__4t0_a0 ! b_e_t0 ! t0__4t0_a0 +LSYM(x150) t0__9a0 ! t0__4t0_a0 ! b_e_2t0 ! t0__2t0_a0 +LSYM(x151) t0__9a0 ! t0__4t0_a0 ! b_e_2t0a0 ! t0__2t0_a0 +LSYM(x152) t0__9a0 ! t0__2t0_a0 ! b_e_shift ! r__r_8t0 +LSYM(x153) t0__9a0 ! t0__2t0_a0 ! b_e_t0 ! t0__8t0_a0 +LSYM(x154) t0__9a0 ! t0__2t0_a0 ! b_e_2t0 ! t0__4t0_a0 +LSYM(x155) t0__32a0 ! t0__t0ma0 ! b_e_t0 ! t0__5t0 +LSYM(x156) t0__9a0 ! t0__2t0_a0 ! b_e_4t0 ! t0__2t0_a0 +LSYM(x157) t0__32a0 ! t0__t0ma0 ! b_e_t02a0 ! t0__5t0 +LSYM(x158) t0__16a0 ! t0__5t0 ! b_e_2t0 ! t0__t0ma0 +LSYM(x159) t0__32a0 ! t0__5t0 ! b_e_t0 ! t0__t0ma0 +LSYM(x160) t0__5a0 ! t0__4t0 ! b_e_shift ! r__r_8t0 +LSYM(x161) t0__8a0 ! t0__5t0 ! b_e_t0 ! t0__4t0_a0 +LSYM(x162) t0__9a0 ! t0__9t0 ! b_e_shift ! r__r_2t0 +LSYM(x163) t0__9a0 ! t0__9t0 ! b_e_t0 ! t0__2t0_a0 +LSYM(x164) t0__5a0 ! t0__8t0_a0 ! b_e_shift ! r__r_4t0 +LSYM(x165) t0__8a0 ! t0__4t0_a0 ! b_e_t0 ! t0__5t0 +LSYM(x166) t0__5a0 ! t0__8t0_a0 ! b_e_2t0 ! t0__2t0_a0 +LSYM(x167) t0__5a0 ! t0__8t0_a0 ! b_e_2t0a0 ! t0__2t0_a0 +LSYM(x168) t0__5a0 ! t0__4t0_a0 ! b_e_shift ! r__r_8t0 +LSYM(x169) t0__5a0 ! t0__4t0_a0 ! b_e_t0 ! t0__8t0_a0 +LSYM(x170) t0__32a0 ! t0__t0_2a0 ! b_e_t0 ! t0__5t0 +LSYM(x171) t0__9a0 ! t0__2t0_a0 ! b_e_t0 ! t0__9t0 +LSYM(x172) t0__5a0 ! t0__4t0_a0 ! b_e_4t0 ! t0__2t0_a0 +LSYM(x173) t0__9a0 ! t0__2t0_a0 ! b_e_t02a0 ! t0__9t0 +LSYM(x174) t0__32a0 ! t0__t0_2a0 ! b_e_t04a0 ! t0__5t0 +LSYM(x175) t0__8a0 ! t0__2t0_a0 ! b_e_5t0 ! t0__2t0_a0 +LSYM(x176) t0__5a0 ! t0__4t0_a0 ! b_e_8t0 ! t0__t0_a0 +LSYM(x177) t0__5a0 ! t0__4t0_a0 ! b_e_8t0a0 ! t0__t0_a0 +LSYM(x178) t0__5a0 ! t0__2t0_a0 ! b_e_2t0 ! t0__8t0_a0 +LSYM(x179) t0__5a0 ! t0__2t0_a0 ! b_e_2t0a0 ! t0__8t0_a0 +LSYM(x180) t0__9a0 ! t0__5t0 ! b_e_shift ! r__r_4t0 +LSYM(x181) t0__9a0 ! t0__5t0 ! b_e_t0 ! 
t0__4t0_a0 +LSYM(x182) t0__9a0 ! t0__5t0 ! b_e_2t0 ! t0__2t0_a0 +LSYM(x183) t0__9a0 ! t0__5t0 ! b_e_2t0a0 ! t0__2t0_a0 +LSYM(x184) t0__5a0 ! t0__9t0 ! b_e_4t0 ! t0__t0_a0 +LSYM(x185) t0__9a0 ! t0__4t0_a0 ! b_e_t0 ! t0__5t0 +LSYM(x186) t0__32a0 ! t0__t0ma0 ! b_e_2t0 ! t0__3t0 +LSYM(x187) t0__9a0 ! t0__4t0_a0 ! b_e_t02a0 ! t0__5t0 +LSYM(x188) t0__9a0 ! t0__5t0 ! b_e_4t0 ! t0__t0_2a0 +LSYM(x189) t0__5a0 ! t0__4t0_a0 ! b_e_t0 ! t0__9t0 +LSYM(x190) t0__9a0 ! t0__2t0_a0 ! b_e_2t0 ! t0__5t0 +LSYM(x191) t0__64a0 ! t0__3t0 ! b_e_t0 ! t0__t0ma0 +LSYM(x192) t0__8a0 ! t0__3t0 ! b_e_shift ! r__r_8t0 +LSYM(x193) t0__8a0 ! t0__3t0 ! b_e_t0 ! t0__8t0_a0 +LSYM(x194) t0__8a0 ! t0__3t0 ! b_e_2t0 ! t0__4t0_a0 +LSYM(x195) t0__8a0 ! t0__8t0_a0 ! b_e_t0 ! t0__3t0 +LSYM(x196) t0__8a0 ! t0__3t0 ! b_e_4t0 ! t0__2t0_a0 +LSYM(x197) t0__8a0 ! t0__3t0 ! b_e_4t0a0 ! t0__2t0_a0 +LSYM(x198) t0__64a0 ! t0__t0_2a0 ! b_e_t0 ! t0__3t0 +LSYM(x199) t0__8a0 ! t0__4t0_a0 ! b_e_2t0a0 ! t0__3t0 +LSYM(x200) t0__5a0 ! t0__5t0 ! b_e_shift ! r__r_8t0 +LSYM(x201) t0__5a0 ! t0__5t0 ! b_e_t0 ! t0__8t0_a0 +LSYM(x202) t0__5a0 ! t0__5t0 ! b_e_2t0 ! t0__4t0_a0 +LSYM(x203) t0__5a0 ! t0__5t0 ! b_e_2t0a0 ! t0__4t0_a0 +LSYM(x204) t0__8a0 ! t0__2t0_a0 ! b_e_4t0 ! t0__3t0 +LSYM(x205) t0__5a0 ! t0__8t0_a0 ! b_e_t0 ! t0__5t0 +LSYM(x206) t0__64a0 ! t0__t0_4a0 ! b_e_t02a0 ! t0__3t0 +LSYM(x207) t0__8a0 ! t0__2t0_a0 ! b_e_3t0 ! t0__4t0_a0 +LSYM(x208) t0__5a0 ! t0__5t0 ! b_e_8t0 ! t0__t0_a0 +LSYM(x209) t0__5a0 ! t0__5t0 ! b_e_8t0a0 ! t0__t0_a0 +LSYM(x210) t0__5a0 ! t0__4t0_a0 ! b_e_2t0 ! t0__5t0 +LSYM(x211) t0__5a0 ! t0__4t0_a0 ! b_e_2t0a0 ! t0__5t0 +LSYM(x212) t0__3a0 ! t0__4t0_a0 ! b_e_4t0 ! t0__4t0_a0 +LSYM(x213) t0__3a0 ! t0__4t0_a0 ! b_e_4t0a0 ! t0__4t0_a0 +LSYM(x214) t0__9a0 ! t0__t0_4a0 ! b_e_2t04a0 ! t0__8t0_a0 +LSYM(x215) t0__5a0 ! t0__4t0_a0 ! b_e_5t0 ! t0__2t0_a0 +LSYM(x216) t0__9a0 ! t0__3t0 ! b_e_shift ! r__r_8t0 +LSYM(x217) t0__9a0 ! t0__3t0 ! b_e_t0 ! t0__8t0_a0 +LSYM(x218) t0__9a0 ! t0__3t0 ! b_e_2t0 ! 
t0__4t0_a0 +LSYM(x219) t0__9a0 ! t0__8t0_a0 ! b_e_t0 ! t0__3t0 +LSYM(x220) t0__3a0 ! t0__9t0 ! b_e_4t0 ! t0__2t0_a0 +LSYM(x221) t0__3a0 ! t0__9t0 ! b_e_4t0a0 ! t0__2t0_a0 +LSYM(x222) t0__9a0 ! t0__4t0_a0 ! b_e_2t0 ! t0__3t0 +LSYM(x223) t0__9a0 ! t0__4t0_a0 ! b_e_2t0a0 ! t0__3t0 +LSYM(x224) t0__9a0 ! t0__3t0 ! b_e_8t0 ! t0__t0_a0 +LSYM(x225) t0__9a0 ! t0__5t0 ! b_e_t0 ! t0__5t0 +LSYM(x226) t0__3a0 ! t0__2t0_a0 ! b_e_t02a0 ! t0__32t0 +LSYM(x227) t0__9a0 ! t0__5t0 ! b_e_t02a0 ! t0__5t0 +LSYM(x228) t0__9a0 ! t0__2t0_a0 ! b_e_4t0 ! t0__3t0 +LSYM(x229) t0__9a0 ! t0__2t0_a0 ! b_e_4t0a0 ! t0__3t0 +LSYM(x230) t0__9a0 ! t0__5t0 ! b_e_5t0 ! t0__t0_a0 +LSYM(x231) t0__9a0 ! t0__2t0_a0 ! b_e_3t0 ! t0__4t0_a0 +LSYM(x232) t0__3a0 ! t0__2t0_a0 ! b_e_8t0 ! t0__4t0_a0 +LSYM(x233) t0__3a0 ! t0__2t0_a0 ! b_e_8t0a0 ! t0__4t0_a0 +LSYM(x234) t0__3a0 ! t0__4t0_a0 ! b_e_2t0 ! t0__9t0 +LSYM(x235) t0__3a0 ! t0__4t0_a0 ! b_e_2t0a0 ! t0__9t0 +LSYM(x236) t0__9a0 ! t0__2t0_a0 ! b_e_4t08a0 ! t0__3t0 +LSYM(x237) t0__16a0 ! t0__5t0 ! b_e_3t0 ! t0__t0ma0 +LSYM(x238) t0__3a0 ! t0__4t0_a0 ! b_e_2t04a0 ! t0__9t0 +LSYM(x239) t0__16a0 ! t0__5t0 ! b_e_t0ma0 ! t0__3t0 +LSYM(x240) t0__9a0 ! t0__t0_a0 ! b_e_8t0 ! t0__3t0 +LSYM(x241) t0__9a0 ! t0__t0_a0 ! b_e_8t0a0 ! t0__3t0 +LSYM(x242) t0__5a0 ! t0__3t0 ! b_e_2t0 ! t0__8t0_a0 +LSYM(x243) t0__9a0 ! t0__9t0 ! b_e_t0 ! t0__3t0 +LSYM(x244) t0__5a0 ! t0__3t0 ! b_e_4t0 ! t0__4t0_a0 +LSYM(x245) t0__8a0 ! t0__3t0 ! b_e_5t0 ! t0__2t0_a0 +LSYM(x246) t0__5a0 ! t0__8t0_a0 ! b_e_2t0 ! t0__3t0 +LSYM(x247) t0__5a0 ! t0__8t0_a0 ! b_e_2t0a0 ! t0__3t0 +LSYM(x248) t0__32a0 ! t0__t0ma0 ! b_e_shift ! r__r_8t0 +LSYM(x249) t0__32a0 ! t0__t0ma0 ! b_e_t0 ! t0__8t0_a0 +LSYM(x250) t0__5a0 ! t0__5t0 ! b_e_2t0 ! t0__5t0 +LSYM(x251) t0__5a0 ! t0__5t0 ! b_e_2t0a0 ! t0__5t0 +LSYM(x252) t0__64a0 ! t0__t0ma0 ! b_e_shift ! r__r_4t0 +LSYM(x253) t0__64a0 ! t0__t0ma0 ! b_e_t0 ! t0__4t0_a0 +LSYM(x254) t0__128a0 ! t0__t0ma0 ! b_e_shift ! r__r_2t0 +LSYM(x255) t0__256a0 ! a1_ne_0_b_l0 ! t0__t0ma0 ! 
b_n_ret_t0
+/*1040 insts before this. */ /* Shared exit sequences reached from the table through b_n_ret_t0 / b_e_*: each finishes the update of r noted beside it, then loops to l0/l1/l2 while a1 != 0 or returns. */
+LSYM(ret_t0)	MILLIRET	/* return with r += t0: the add at e_t0 is the instruction after the return */
+LSYM(e_t0)	r__r_t0	/* r += t0, then fall into e_shift */
+LSYM(e_shift)	a1_ne_0_b_l2	/* r is complete for this pass: loop to l2 or return */
+	a0__256a0 /* a0 <<= 8 *********** */
+	MILLIRETN
+LSYM(e_t0ma0)	a1_ne_0_b_l0	/* r += t0 - a0 */
+	t0__t0ma0
+	MILLIRET
+	r__r_t0
+LSYM(e_t0a0)	a1_ne_0_b_l0	/* r += t0 + a0 */
+	t0__t0_a0
+	MILLIRET
+	r__r_t0
+LSYM(e_t02a0)	a1_ne_0_b_l0	/* r += t0 + 2*a0 */
+	t0__t0_2a0
+	MILLIRET
+	r__r_t0
+LSYM(e_t04a0)	a1_ne_0_b_l0	/* r += t0 + 4*a0 */
+	t0__t0_4a0
+	MILLIRET
+	r__r_t0
+LSYM(e_2t0)	a1_ne_0_b_l1	/* r += 2*t0 */
+	r__r_2t0
+	MILLIRETN
+LSYM(e_2t0a0)	a1_ne_0_b_l0	/* r += 2*t0 + a0 */
+	t0__2t0_a0
+	MILLIRET
+	r__r_t0
+LSYM(e2t04a0)	t0__t0_2a0	/* r += 2*t0 + 4*a0, computed as 2*(t0 + 2*a0) */
+	a1_ne_0_b_l1
+	r__r_2t0
+	MILLIRETN
+LSYM(e_3t0)	a1_ne_0_b_l0	/* r += 3*t0 */
+	t0__3t0
+	MILLIRET
+	r__r_t0
+LSYM(e_4t0)	a1_ne_0_b_l1	/* r += 4*t0 */
+	r__r_4t0
+	MILLIRETN
+LSYM(e_4t0a0)	a1_ne_0_b_l0	/* r += 4*t0 + a0 */
+	t0__4t0_a0
+	MILLIRET
+	r__r_t0
+LSYM(e4t08a0)	t0__t0_2a0	/* r += 4*t0 + 8*a0, computed as 4*(t0 + 2*a0) */
+	a1_ne_0_b_l1
+	r__r_4t0
+	MILLIRETN
+LSYM(e_5t0)	a1_ne_0_b_l0	/* r += 5*t0 */
+	t0__5t0
+	MILLIRET
+	r__r_t0
+LSYM(e_8t0)	a1_ne_0_b_l1	/* r += 8*t0 */
+	r__r_8t0
+	MILLIRETN
+LSYM(e_8t0a0)	a1_ne_0_b_l0	/* r += 8*t0 + a0 */
+	t0__8t0_a0
+	MILLIRET
+	r__r_t0
+
+	.procend
+	.end
+#endif
diff --git a/libgcc/config/pa/t-linux b/libgcc/config/pa/t-linux
new file mode 100644
index 00000000000..d396bf7705a
--- /dev/null
+++ b/libgcc/config/pa/t-linux
@@ -0,0 +1,6 @@
+# Plug millicode routines into libgcc.a.  We want these on both native and
+# cross compiles.  We use the "64-bit" routines because the "32-bit" code
+# is broken for certain corner cases.
+
+LIB1ASMSRC = pa/milli64.S
+LIB1ASMFUNCS = _divI _divU _remI _remU _div_const _mulI _dyncall
diff --git a/libgcc/config/pa/t-linux64 b/libgcc/config/pa/t-linux64
new file mode 100644
index 00000000000..6cb9806ff2e
--- /dev/null
+++ b/libgcc/config/pa/t-linux64
@@ -0,0 +1,4 @@
+# Plug millicode routines into libgcc.a.  We want these on both native and
+# cross compiles.
+# FIXME: Explain.
+LIB1ASMFUNCS := $(filter-out _dyncall, $(LIB1ASMFUNCS)) diff --git a/libgcc/config/picochip/lib1funcs.S b/libgcc/config/picochip/lib1funcs.S new file mode 100644 index 00000000000..d344170d248 --- /dev/null +++ b/libgcc/config/picochip/lib1funcs.S @@ -0,0 +1,4 @@ +// picoChip ASM file +// Fake libgcc asm file. This contains nothing, but is used to prevent gcc +// getting upset about the lack of a lib1funcs.S file when LIB1ASMFUNCS is +// defined to switch off the compilation of parts of libgcc. diff --git a/libgcc/config/picochip/t-picochip b/libgcc/config/picochip/t-picochip index 5135d500cbb..a596ec98947 100644 --- a/libgcc/config/picochip/t-picochip +++ b/libgcc/config/picochip/t-picochip @@ -1,2 +1,9 @@ +# Prevent some of the more complicated libgcc functions from being +# compiled. This is because they are generally too big to fit into an +# AE anyway, so there is no point in having them. Also, some don't +# compile properly so we'll ignore them for the moment. +LIB1ASMSRC = picochip/lib1funcs.S +LIB1ASMFUNCS = _mulsc3 _divsc3 + # Turn off the building of exception handling libraries. LIB2ADDEH = diff --git a/libgcc/config/sh/lib1funcs.S b/libgcc/config/sh/lib1funcs.S new file mode 100644 index 00000000000..2f0ca16cd91 --- /dev/null +++ b/libgcc/config/sh/lib1funcs.S @@ -0,0 +1,3933 @@ +/* Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003, + 2004, 2005, 2006, 2009 + Free Software Foundation, Inc. + +This file is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3, or (at your option) any +later version. + +This file is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. 
+ +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + + +!! libgcc routines for the Renesas / SuperH SH CPUs. +!! Contributed by Steve Chamberlain. +!! sac@cygnus.com + +!! ashiftrt_r4_x, ___ashrsi3, ___ashlsi3, ___lshrsi3 routines +!! recoded in assembly by Toshiyasu Morita +!! tm@netcom.com + +#if defined(__ELF__) && defined(__linux__) +.section .note.GNU-stack,"",%progbits +.previous +#endif + +/* SH2 optimizations for ___ashrsi3, ___ashlsi3, ___lshrsi3 and + ELF local label prefixes by J"orn Rennecke + amylaar@cygnus.com */ + +#include "lib1funcs.h" + +/* t-vxworks needs to build both PIC and non-PIC versions of libgcc, + so it is more convenient to define NO_FPSCR_VALUES here than to + define it on the command line. */ +#if defined __vxworks && defined __PIC__ +#define NO_FPSCR_VALUES +#endif + +#if ! 
__SH5__ +#ifdef L_ashiftrt + .global GLOBAL(ashiftrt_r4_0) + .global GLOBAL(ashiftrt_r4_1) + .global GLOBAL(ashiftrt_r4_2) + .global GLOBAL(ashiftrt_r4_3) + .global GLOBAL(ashiftrt_r4_4) + .global GLOBAL(ashiftrt_r4_5) + .global GLOBAL(ashiftrt_r4_6) + .global GLOBAL(ashiftrt_r4_7) + .global GLOBAL(ashiftrt_r4_8) + .global GLOBAL(ashiftrt_r4_9) + .global GLOBAL(ashiftrt_r4_10) + .global GLOBAL(ashiftrt_r4_11) + .global GLOBAL(ashiftrt_r4_12) + .global GLOBAL(ashiftrt_r4_13) + .global GLOBAL(ashiftrt_r4_14) + .global GLOBAL(ashiftrt_r4_15) + .global GLOBAL(ashiftrt_r4_16) + .global GLOBAL(ashiftrt_r4_17) + .global GLOBAL(ashiftrt_r4_18) + .global GLOBAL(ashiftrt_r4_19) + .global GLOBAL(ashiftrt_r4_20) + .global GLOBAL(ashiftrt_r4_21) + .global GLOBAL(ashiftrt_r4_22) + .global GLOBAL(ashiftrt_r4_23) + .global GLOBAL(ashiftrt_r4_24) + .global GLOBAL(ashiftrt_r4_25) + .global GLOBAL(ashiftrt_r4_26) + .global GLOBAL(ashiftrt_r4_27) + .global GLOBAL(ashiftrt_r4_28) + .global GLOBAL(ashiftrt_r4_29) + .global GLOBAL(ashiftrt_r4_30) + .global GLOBAL(ashiftrt_r4_31) + .global GLOBAL(ashiftrt_r4_32) + + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_0)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_1)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_2)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_3)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_4)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_5)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_6)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_7)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_8)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_9)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_10)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_11)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_12)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_13)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_14)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_15)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_16)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_17)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_18)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_19)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_20)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_21)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_22)) + 
HIDDEN_FUNC(GLOBAL(ashiftrt_r4_23)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_24)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_25)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_26)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_27)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_28)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_29)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_30)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_31)) + HIDDEN_FUNC(GLOBAL(ashiftrt_r4_32)) + + .align 1 +GLOBAL(ashiftrt_r4_32): +GLOBAL(ashiftrt_r4_31): + rotcl r4 + rts + subc r4,r4 + +GLOBAL(ashiftrt_r4_30): + shar r4 +GLOBAL(ashiftrt_r4_29): + shar r4 +GLOBAL(ashiftrt_r4_28): + shar r4 +GLOBAL(ashiftrt_r4_27): + shar r4 +GLOBAL(ashiftrt_r4_26): + shar r4 +GLOBAL(ashiftrt_r4_25): + shar r4 +GLOBAL(ashiftrt_r4_24): + shlr16 r4 + shlr8 r4 + rts + exts.b r4,r4 + +GLOBAL(ashiftrt_r4_23): + shar r4 +GLOBAL(ashiftrt_r4_22): + shar r4 +GLOBAL(ashiftrt_r4_21): + shar r4 +GLOBAL(ashiftrt_r4_20): + shar r4 +GLOBAL(ashiftrt_r4_19): + shar r4 +GLOBAL(ashiftrt_r4_18): + shar r4 +GLOBAL(ashiftrt_r4_17): + shar r4 +GLOBAL(ashiftrt_r4_16): + shlr16 r4 + rts + exts.w r4,r4 + +GLOBAL(ashiftrt_r4_15): + shar r4 +GLOBAL(ashiftrt_r4_14): + shar r4 +GLOBAL(ashiftrt_r4_13): + shar r4 +GLOBAL(ashiftrt_r4_12): + shar r4 +GLOBAL(ashiftrt_r4_11): + shar r4 +GLOBAL(ashiftrt_r4_10): + shar r4 +GLOBAL(ashiftrt_r4_9): + shar r4 +GLOBAL(ashiftrt_r4_8): + shar r4 +GLOBAL(ashiftrt_r4_7): + shar r4 +GLOBAL(ashiftrt_r4_6): + shar r4 +GLOBAL(ashiftrt_r4_5): + shar r4 +GLOBAL(ashiftrt_r4_4): + shar r4 +GLOBAL(ashiftrt_r4_3): + shar r4 +GLOBAL(ashiftrt_r4_2): + shar r4 +GLOBAL(ashiftrt_r4_1): + rts + shar r4 + +GLOBAL(ashiftrt_r4_0): + rts + nop + + ENDFUNC(GLOBAL(ashiftrt_r4_0)) + ENDFUNC(GLOBAL(ashiftrt_r4_1)) + ENDFUNC(GLOBAL(ashiftrt_r4_2)) + ENDFUNC(GLOBAL(ashiftrt_r4_3)) + ENDFUNC(GLOBAL(ashiftrt_r4_4)) + ENDFUNC(GLOBAL(ashiftrt_r4_5)) + ENDFUNC(GLOBAL(ashiftrt_r4_6)) + ENDFUNC(GLOBAL(ashiftrt_r4_7)) + ENDFUNC(GLOBAL(ashiftrt_r4_8)) + ENDFUNC(GLOBAL(ashiftrt_r4_9)) + ENDFUNC(GLOBAL(ashiftrt_r4_10)) + 
ENDFUNC(GLOBAL(ashiftrt_r4_11)) + ENDFUNC(GLOBAL(ashiftrt_r4_12)) + ENDFUNC(GLOBAL(ashiftrt_r4_13)) + ENDFUNC(GLOBAL(ashiftrt_r4_14)) + ENDFUNC(GLOBAL(ashiftrt_r4_15)) + ENDFUNC(GLOBAL(ashiftrt_r4_16)) + ENDFUNC(GLOBAL(ashiftrt_r4_17)) + ENDFUNC(GLOBAL(ashiftrt_r4_18)) + ENDFUNC(GLOBAL(ashiftrt_r4_19)) + ENDFUNC(GLOBAL(ashiftrt_r4_20)) + ENDFUNC(GLOBAL(ashiftrt_r4_21)) + ENDFUNC(GLOBAL(ashiftrt_r4_22)) + ENDFUNC(GLOBAL(ashiftrt_r4_23)) + ENDFUNC(GLOBAL(ashiftrt_r4_24)) + ENDFUNC(GLOBAL(ashiftrt_r4_25)) + ENDFUNC(GLOBAL(ashiftrt_r4_26)) + ENDFUNC(GLOBAL(ashiftrt_r4_27)) + ENDFUNC(GLOBAL(ashiftrt_r4_28)) + ENDFUNC(GLOBAL(ashiftrt_r4_29)) + ENDFUNC(GLOBAL(ashiftrt_r4_30)) + ENDFUNC(GLOBAL(ashiftrt_r4_31)) + ENDFUNC(GLOBAL(ashiftrt_r4_32)) +#endif + +#ifdef L_ashiftrt_n + +! +! GLOBAL(ashrsi3) +! +! Entry: +! +! r4: Value to shift +! r5: Shifts +! +! Exit: +! +! r0: Result +! +! Destroys: +! +! (none) +! + + .global GLOBAL(ashrsi3) + HIDDEN_FUNC(GLOBAL(ashrsi3)) + .align 2 +GLOBAL(ashrsi3): + mov #31,r0 + and r0,r5 + mova LOCAL(ashrsi3_table),r0 + mov.b @(r0,r5),r5 +#ifdef __sh1__ + add r5,r0 + jmp @r0 +#else + braf r5 +#endif + mov r4,r0 + + .align 2 +LOCAL(ashrsi3_table): + .byte LOCAL(ashrsi3_0)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_1)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_2)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_3)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_4)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_5)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_6)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_7)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_8)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_9)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_10)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_11)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_12)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_13)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_14)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_15)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_16)-LOCAL(ashrsi3_table) + .byte 
LOCAL(ashrsi3_17)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_18)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_19)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_20)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_21)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_22)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_23)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_24)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_25)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_26)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_27)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_28)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_29)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_30)-LOCAL(ashrsi3_table) + .byte LOCAL(ashrsi3_31)-LOCAL(ashrsi3_table) + +LOCAL(ashrsi3_31): + rotcl r0 + rts + subc r0,r0 + +LOCAL(ashrsi3_30): + shar r0 +LOCAL(ashrsi3_29): + shar r0 +LOCAL(ashrsi3_28): + shar r0 +LOCAL(ashrsi3_27): + shar r0 +LOCAL(ashrsi3_26): + shar r0 +LOCAL(ashrsi3_25): + shar r0 +LOCAL(ashrsi3_24): + shlr16 r0 + shlr8 r0 + rts + exts.b r0,r0 + +LOCAL(ashrsi3_23): + shar r0 +LOCAL(ashrsi3_22): + shar r0 +LOCAL(ashrsi3_21): + shar r0 +LOCAL(ashrsi3_20): + shar r0 +LOCAL(ashrsi3_19): + shar r0 +LOCAL(ashrsi3_18): + shar r0 +LOCAL(ashrsi3_17): + shar r0 +LOCAL(ashrsi3_16): + shlr16 r0 + rts + exts.w r0,r0 + +LOCAL(ashrsi3_15): + shar r0 +LOCAL(ashrsi3_14): + shar r0 +LOCAL(ashrsi3_13): + shar r0 +LOCAL(ashrsi3_12): + shar r0 +LOCAL(ashrsi3_11): + shar r0 +LOCAL(ashrsi3_10): + shar r0 +LOCAL(ashrsi3_9): + shar r0 +LOCAL(ashrsi3_8): + shar r0 +LOCAL(ashrsi3_7): + shar r0 +LOCAL(ashrsi3_6): + shar r0 +LOCAL(ashrsi3_5): + shar r0 +LOCAL(ashrsi3_4): + shar r0 +LOCAL(ashrsi3_3): + shar r0 +LOCAL(ashrsi3_2): + shar r0 +LOCAL(ashrsi3_1): + rts + shar r0 + +LOCAL(ashrsi3_0): + rts + nop + + ENDFUNC(GLOBAL(ashrsi3)) +#endif + +#ifdef L_ashiftlt + +! +! GLOBAL(ashlsi3) +! +! Entry: +! +! r4: Value to shift +! r5: Shifts +! +! Exit: +! +! r0: Result +! +! Destroys: +! +! (none) +! 
+ .global GLOBAL(ashlsi3) + HIDDEN_FUNC(GLOBAL(ashlsi3)) + .align 2 +GLOBAL(ashlsi3): + mov #31,r0 + and r0,r5 + mova LOCAL(ashlsi3_table),r0 + mov.b @(r0,r5),r5 +#ifdef __sh1__ + add r5,r0 + jmp @r0 +#else + braf r5 +#endif + mov r4,r0 + + .align 2 +LOCAL(ashlsi3_table): + .byte LOCAL(ashlsi3_0)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_1)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_2)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_3)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_4)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_5)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_6)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_7)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_8)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_9)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_10)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_11)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_12)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_13)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_14)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_15)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_16)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_17)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_18)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_19)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_20)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_21)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_22)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_23)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_24)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_25)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_26)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_27)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_28)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_29)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_30)-LOCAL(ashlsi3_table) + .byte LOCAL(ashlsi3_31)-LOCAL(ashlsi3_table) + +LOCAL(ashlsi3_6): + shll2 r0 +LOCAL(ashlsi3_4): + shll2 r0 +LOCAL(ashlsi3_2): + rts + shll2 r0 + +LOCAL(ashlsi3_7): + shll2 r0 +LOCAL(ashlsi3_5): + shll2 r0 +LOCAL(ashlsi3_3): + shll2 r0 +LOCAL(ashlsi3_1): + rts + shll r0 + 
+LOCAL(ashlsi3_14): + shll2 r0 +LOCAL(ashlsi3_12): + shll2 r0 +LOCAL(ashlsi3_10): + shll2 r0 +LOCAL(ashlsi3_8): + rts + shll8 r0 + +LOCAL(ashlsi3_15): + shll2 r0 +LOCAL(ashlsi3_13): + shll2 r0 +LOCAL(ashlsi3_11): + shll2 r0 +LOCAL(ashlsi3_9): + shll8 r0 + rts + shll r0 + +LOCAL(ashlsi3_22): + shll2 r0 +LOCAL(ashlsi3_20): + shll2 r0 +LOCAL(ashlsi3_18): + shll2 r0 +LOCAL(ashlsi3_16): + rts + shll16 r0 + +LOCAL(ashlsi3_23): + shll2 r0 +LOCAL(ashlsi3_21): + shll2 r0 +LOCAL(ashlsi3_19): + shll2 r0 +LOCAL(ashlsi3_17): + shll16 r0 + rts + shll r0 + +LOCAL(ashlsi3_30): + shll2 r0 +LOCAL(ashlsi3_28): + shll2 r0 +LOCAL(ashlsi3_26): + shll2 r0 +LOCAL(ashlsi3_24): + shll16 r0 + rts + shll8 r0 + +LOCAL(ashlsi3_31): + shll2 r0 +LOCAL(ashlsi3_29): + shll2 r0 +LOCAL(ashlsi3_27): + shll2 r0 +LOCAL(ashlsi3_25): + shll16 r0 + shll8 r0 + rts + shll r0 + +LOCAL(ashlsi3_0): + rts + nop + + ENDFUNC(GLOBAL(ashlsi3)) +#endif + +#ifdef L_lshiftrt + +! +! GLOBAL(lshrsi3) +! +! Entry: +! +! r4: Value to shift +! r5: Shifts +! +! Exit: +! +! r0: Result +! +! Destroys: +! +! (none) +! 
+ .global GLOBAL(lshrsi3) + HIDDEN_FUNC(GLOBAL(lshrsi3)) + .align 2 +GLOBAL(lshrsi3): + mov #31,r0 + and r0,r5 + mova LOCAL(lshrsi3_table),r0 + mov.b @(r0,r5),r5 +#ifdef __sh1__ + add r5,r0 + jmp @r0 +#else + braf r5 +#endif + mov r4,r0 + + .align 2 +LOCAL(lshrsi3_table): + .byte LOCAL(lshrsi3_0)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_1)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_2)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_3)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_4)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_5)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_6)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_7)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_8)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_9)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_10)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_11)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_12)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_13)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_14)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_15)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_16)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_17)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_18)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_19)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_20)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_21)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_22)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_23)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_24)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_25)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_26)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_27)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_28)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_29)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_30)-LOCAL(lshrsi3_table) + .byte LOCAL(lshrsi3_31)-LOCAL(lshrsi3_table) + +LOCAL(lshrsi3_6): + shlr2 r0 +LOCAL(lshrsi3_4): + shlr2 r0 +LOCAL(lshrsi3_2): + rts + shlr2 r0 + +LOCAL(lshrsi3_7): + shlr2 r0 +LOCAL(lshrsi3_5): + shlr2 r0 +LOCAL(lshrsi3_3): + shlr2 r0 +LOCAL(lshrsi3_1): + rts + shlr r0 + 
+LOCAL(lshrsi3_14): + shlr2 r0 +LOCAL(lshrsi3_12): + shlr2 r0 +LOCAL(lshrsi3_10): + shlr2 r0 +LOCAL(lshrsi3_8): + rts + shlr8 r0 + +LOCAL(lshrsi3_15): + shlr2 r0 +LOCAL(lshrsi3_13): + shlr2 r0 +LOCAL(lshrsi3_11): + shlr2 r0 +LOCAL(lshrsi3_9): + shlr8 r0 + rts + shlr r0 + +LOCAL(lshrsi3_22): + shlr2 r0 +LOCAL(lshrsi3_20): + shlr2 r0 +LOCAL(lshrsi3_18): + shlr2 r0 +LOCAL(lshrsi3_16): + rts + shlr16 r0 + +LOCAL(lshrsi3_23): + shlr2 r0 +LOCAL(lshrsi3_21): + shlr2 r0 +LOCAL(lshrsi3_19): + shlr2 r0 +LOCAL(lshrsi3_17): + shlr16 r0 + rts + shlr r0 + +LOCAL(lshrsi3_30): + shlr2 r0 +LOCAL(lshrsi3_28): + shlr2 r0 +LOCAL(lshrsi3_26): + shlr2 r0 +LOCAL(lshrsi3_24): + shlr16 r0 + rts + shlr8 r0 + +LOCAL(lshrsi3_31): + shlr2 r0 +LOCAL(lshrsi3_29): + shlr2 r0 +LOCAL(lshrsi3_27): + shlr2 r0 +LOCAL(lshrsi3_25): + shlr16 r0 + shlr8 r0 + rts + shlr r0 + +LOCAL(lshrsi3_0): + rts + nop + + ENDFUNC(GLOBAL(lshrsi3)) +#endif + +#ifdef L_movmem + .text + .balign 4 + .global GLOBAL(movmem) + HIDDEN_FUNC(GLOBAL(movmem)) + HIDDEN_ALIAS(movstr,movmem) + /* This would be a lot simpler if r6 contained the byte count + minus 64, and we wouldn't be called here for a byte count of 64. */ +GLOBAL(movmem): + sts.l pr,@-r15 + shll2 r6 + bsr GLOBAL(movmemSI52+2) + mov.l @(48,r5),r0 + .balign 4 +LOCAL(movmem_loop): /* Reached with rts */ + mov.l @(60,r5),r0 + add #-64,r6 + mov.l r0,@(60,r4) + tst r6,r6 + mov.l @(56,r5),r0 + bt LOCAL(movmem_done) + mov.l r0,@(56,r4) + cmp/pl r6 + mov.l @(52,r5),r0 + add #64,r5 + mov.l r0,@(52,r4) + add #64,r4 + bt GLOBAL(movmemSI52) +! done all the large groups, do the remainder +! jump to movmem+ + mova GLOBAL(movmemSI4)+4,r0 + add r6,r0 + jmp @r0 +LOCAL(movmem_done): ! share slot insn, works out aligned. + lds.l @r15+,pr + mov.l r0,@(56,r4) + mov.l @(52,r5),r0 + rts + mov.l r0,@(52,r4) + .balign 4 +! ??? We need aliases movstr* for movmem* for the older libraries. These +! aliases will be removed at some point in the future. 
+ .global GLOBAL(movmemSI64) + HIDDEN_FUNC(GLOBAL(movmemSI64)) + HIDDEN_ALIAS(movstrSI64,movmemSI64) +GLOBAL(movmemSI64): + mov.l @(60,r5),r0 + mov.l r0,@(60,r4) + .global GLOBAL(movmemSI60) + HIDDEN_FUNC(GLOBAL(movmemSI60)) + HIDDEN_ALIAS(movstrSI60,movmemSI60) +GLOBAL(movmemSI60): + mov.l @(56,r5),r0 + mov.l r0,@(56,r4) + .global GLOBAL(movmemSI56) + HIDDEN_FUNC(GLOBAL(movmemSI56)) + HIDDEN_ALIAS(movstrSI56,movmemSI56) +GLOBAL(movmemSI56): + mov.l @(52,r5),r0 + mov.l r0,@(52,r4) + .global GLOBAL(movmemSI52) + HIDDEN_FUNC(GLOBAL(movmemSI52)) + HIDDEN_ALIAS(movstrSI52,movmemSI52) +GLOBAL(movmemSI52): + mov.l @(48,r5),r0 + mov.l r0,@(48,r4) + .global GLOBAL(movmemSI48) + HIDDEN_FUNC(GLOBAL(movmemSI48)) + HIDDEN_ALIAS(movstrSI48,movmemSI48) +GLOBAL(movmemSI48): + mov.l @(44,r5),r0 + mov.l r0,@(44,r4) + .global GLOBAL(movmemSI44) + HIDDEN_FUNC(GLOBAL(movmemSI44)) + HIDDEN_ALIAS(movstrSI44,movmemSI44) +GLOBAL(movmemSI44): + mov.l @(40,r5),r0 + mov.l r0,@(40,r4) + .global GLOBAL(movmemSI40) + HIDDEN_FUNC(GLOBAL(movmemSI40)) + HIDDEN_ALIAS(movstrSI40,movmemSI40) +GLOBAL(movmemSI40): + mov.l @(36,r5),r0 + mov.l r0,@(36,r4) + .global GLOBAL(movmemSI36) + HIDDEN_FUNC(GLOBAL(movmemSI36)) + HIDDEN_ALIAS(movstrSI36,movmemSI36) +GLOBAL(movmemSI36): + mov.l @(32,r5),r0 + mov.l r0,@(32,r4) + .global GLOBAL(movmemSI32) + HIDDEN_FUNC(GLOBAL(movmemSI32)) + HIDDEN_ALIAS(movstrSI32,movmemSI32) +GLOBAL(movmemSI32): + mov.l @(28,r5),r0 + mov.l r0,@(28,r4) + .global GLOBAL(movmemSI28) + HIDDEN_FUNC(GLOBAL(movmemSI28)) + HIDDEN_ALIAS(movstrSI28,movmemSI28) +GLOBAL(movmemSI28): + mov.l @(24,r5),r0 + mov.l r0,@(24,r4) + .global GLOBAL(movmemSI24) + HIDDEN_FUNC(GLOBAL(movmemSI24)) + HIDDEN_ALIAS(movstrSI24,movmemSI24) +GLOBAL(movmemSI24): + mov.l @(20,r5),r0 + mov.l r0,@(20,r4) + .global GLOBAL(movmemSI20) + HIDDEN_FUNC(GLOBAL(movmemSI20)) + HIDDEN_ALIAS(movstrSI20,movmemSI20) +GLOBAL(movmemSI20): + mov.l @(16,r5),r0 + mov.l r0,@(16,r4) + .global GLOBAL(movmemSI16) + 
HIDDEN_FUNC(GLOBAL(movmemSI16)) + HIDDEN_ALIAS(movstrSI16,movmemSI16) +GLOBAL(movmemSI16): + mov.l @(12,r5),r0 + mov.l r0,@(12,r4) + .global GLOBAL(movmemSI12) + HIDDEN_FUNC(GLOBAL(movmemSI12)) + HIDDEN_ALIAS(movstrSI12,movmemSI12) +GLOBAL(movmemSI12): + mov.l @(8,r5),r0 + mov.l r0,@(8,r4) + .global GLOBAL(movmemSI8) + HIDDEN_FUNC(GLOBAL(movmemSI8)) + HIDDEN_ALIAS(movstrSI8,movmemSI8) +GLOBAL(movmemSI8): + mov.l @(4,r5),r0 + mov.l r0,@(4,r4) + .global GLOBAL(movmemSI4) + HIDDEN_FUNC(GLOBAL(movmemSI4)) + HIDDEN_ALIAS(movstrSI4,movmemSI4) +GLOBAL(movmemSI4): + mov.l @(0,r5),r0 + rts + mov.l r0,@(0,r4) + + ENDFUNC(GLOBAL(movmemSI64)) + ENDFUNC(GLOBAL(movmemSI60)) + ENDFUNC(GLOBAL(movmemSI56)) + ENDFUNC(GLOBAL(movmemSI52)) + ENDFUNC(GLOBAL(movmemSI48)) + ENDFUNC(GLOBAL(movmemSI44)) + ENDFUNC(GLOBAL(movmemSI40)) + ENDFUNC(GLOBAL(movmemSI36)) + ENDFUNC(GLOBAL(movmemSI32)) + ENDFUNC(GLOBAL(movmemSI28)) + ENDFUNC(GLOBAL(movmemSI24)) + ENDFUNC(GLOBAL(movmemSI20)) + ENDFUNC(GLOBAL(movmemSI16)) + ENDFUNC(GLOBAL(movmemSI12)) + ENDFUNC(GLOBAL(movmemSI8)) + ENDFUNC(GLOBAL(movmemSI4)) + ENDFUNC(GLOBAL(movmem)) +#endif + +#ifdef L_movmem_i4 + .text + .global GLOBAL(movmem_i4_even) + .global GLOBAL(movmem_i4_odd) + .global GLOBAL(movmemSI12_i4) + + HIDDEN_FUNC(GLOBAL(movmem_i4_even)) + HIDDEN_FUNC(GLOBAL(movmem_i4_odd)) + HIDDEN_FUNC(GLOBAL(movmemSI12_i4)) + + HIDDEN_ALIAS(movstr_i4_even,movmem_i4_even) + HIDDEN_ALIAS(movstr_i4_odd,movmem_i4_odd) + HIDDEN_ALIAS(movstrSI12_i4,movmemSI12_i4) + + .p2align 5 +L_movmem_2mod4_end: + mov.l r0,@(16,r4) + rts + mov.l r1,@(20,r4) + + .p2align 2 + +GLOBAL(movmem_i4_even): + mov.l @r5+,r0 + bra L_movmem_start_even + mov.l @r5+,r1 + +GLOBAL(movmem_i4_odd): + mov.l @r5+,r1 + add #-4,r4 + mov.l @r5+,r2 + mov.l @r5+,r3 + mov.l r1,@(4,r4) + mov.l r2,@(8,r4) + +L_movmem_loop: + mov.l r3,@(12,r4) + dt r6 + mov.l @r5+,r0 + bt/s L_movmem_2mod4_end + mov.l @r5+,r1 + add #16,r4 +L_movmem_start_even: + mov.l @r5+,r2 + mov.l @r5+,r3 + mov.l r0,@r4 + dt r6 
+ mov.l r1,@(4,r4) + bf/s L_movmem_loop + mov.l r2,@(8,r4) + rts + mov.l r3,@(12,r4) + + ENDFUNC(GLOBAL(movmem_i4_even)) + ENDFUNC(GLOBAL(movmem_i4_odd)) + + .p2align 4 +GLOBAL(movmemSI12_i4): + mov.l @r5,r0 + mov.l @(4,r5),r1 + mov.l @(8,r5),r2 + mov.l r0,@r4 + mov.l r1,@(4,r4) + rts + mov.l r2,@(8,r4) + + ENDFUNC(GLOBAL(movmemSI12_i4)) +#endif + +#ifdef L_mulsi3 + + + .global GLOBAL(mulsi3) + HIDDEN_FUNC(GLOBAL(mulsi3)) + +! r4 = aabb +! r5 = ccdd +! r0 = aabb*ccdd via partial products +! +! if aa == 0 and cc == 0 +! r0 = bb*dd +! +! else +! r0 = bb*dd + (aa*dd*65536) + (cc*bb*65536) +! + +GLOBAL(mulsi3): + mulu.w r4,r5 ! multiply the lsws macl=bb*dd + mov r5,r3 ! r3 = ccdd + swap.w r4,r2 ! r2 = bbaa + xtrct r2,r3 ! r3 = aacc + tst r3,r3 ! msws zero ? + bf hiset + rts ! yes - then we have the answer + sts macl,r0 + +hiset: sts macl,r0 ! r0 = bb*dd + mulu.w r2,r5 ! brewing macl = aa*dd + sts macl,r1 + mulu.w r3,r4 ! brewing macl = cc*bb + sts macl,r2 + add r1,r2 + shll16 r2 + rts + add r2,r0 + + ENDFUNC(GLOBAL(mulsi3)) +#endif +#endif /* ! __SH5__ */ +#ifdef L_sdivsi3_i4 + .title "SH DIVIDE" +!! 4 byte integer Divide code for the Renesas SH +#ifdef __SH4__ +!! args in r4 and r5, result in fpul, clobber dr0, dr2 + + .global GLOBAL(sdivsi3_i4) + HIDDEN_FUNC(GLOBAL(sdivsi3_i4)) +GLOBAL(sdivsi3_i4): + lds r4,fpul + float fpul,dr0 + lds r5,fpul + float fpul,dr2 + fdiv dr2,dr0 + rts + ftrc dr0,fpul + + ENDFUNC(GLOBAL(sdivsi3_i4)) +#elif defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) || (defined (__SH5__) && ! defined __SH4_NOFPU__) +!! args in r4 and r5, result in fpul, clobber r2, dr0, dr2 + +#if ! 
__SH5__ || __SH5__ == 32 +#if __SH5__ + .mode SHcompact +#endif + .global GLOBAL(sdivsi3_i4) + HIDDEN_FUNC(GLOBAL(sdivsi3_i4)) +GLOBAL(sdivsi3_i4): + sts.l fpscr,@-r15 + mov #8,r2 + swap.w r2,r2 + lds r2,fpscr + lds r4,fpul + float fpul,dr0 + lds r5,fpul + float fpul,dr2 + fdiv dr2,dr0 + ftrc dr0,fpul + rts + lds.l @r15+,fpscr + + ENDFUNC(GLOBAL(sdivsi3_i4)) +#endif /* ! __SH5__ || __SH5__ == 32 */ +#endif /* ! __SH4__ */ +#endif + +#ifdef L_sdivsi3 +/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with + sh2e/sh3e code. */ +#if (! defined(__SH4__) && ! defined (__SH4_SINGLE__)) || defined (__linux__) +!! +!! Steve Chamberlain +!! sac@cygnus.com +!! +!! + +!! args in r4 and r5, result in r0 clobber r1, r2, r3, and t bit + + .global GLOBAL(sdivsi3) +#if __SHMEDIA__ +#if __SH5__ == 32 + .section .text..SHmedia32,"ax" +#else + .text +#endif + .align 2 +#if 0 +/* The assembly code that follows is a hand-optimized version of the C + code that follows. Note that the registers that are modified are + exactly those listed as clobbered in the patterns divsi3_i1 and + divsi3_i1_media. + +int __sdivsi3 (i, j) + int i, j; +{ + register unsigned long long r18 asm ("r18"); + register unsigned long long r19 asm ("r19"); + register unsigned long long r0 asm ("r0") = 0; + register unsigned long long r1 asm ("r1") = 1; + register int r2 asm ("r2") = i >> 31; + register int r3 asm ("r3") = j >> 31; + + r2 = r2 ? r2 : r1; + r3 = r3 ? 
r3 : r1; + r18 = i * r2; + r19 = j * r3; + r2 *= r3; + + r19 <<= 31; + r1 <<= 31; + do + if (r18 >= r19) + r0 |= r1, r18 -= r19; + while (r19 >>= 1, r1 >>= 1); + + return r2 * (int)r0; +} +*/ +GLOBAL(sdivsi3): + pt/l LOCAL(sdivsi3_dontadd), tr2 + pt/l LOCAL(sdivsi3_loop), tr1 + ptabs/l r18, tr0 + movi 0, r0 + movi 1, r1 + shari.l r4, 31, r2 + shari.l r5, 31, r3 + cmveq r2, r1, r2 + cmveq r3, r1, r3 + muls.l r4, r2, r18 + muls.l r5, r3, r19 + muls.l r2, r3, r2 + shlli r19, 31, r19 + shlli r1, 31, r1 +LOCAL(sdivsi3_loop): + bgtu r19, r18, tr2 + or r0, r1, r0 + sub r18, r19, r18 +LOCAL(sdivsi3_dontadd): + shlri r1, 1, r1 + shlri r19, 1, r19 + bnei r1, 0, tr1 + muls.l r0, r2, r0 + add.l r0, r63, r0 + blink tr0, r63 +#elif 0 /* ! 0 */ + // inputs: r4,r5 + // clobbered: r1,r2,r3,r18,r19,r20,r21,r25,tr0 + // result in r0 +GLOBAL(sdivsi3): + // can create absolute value without extra latency, + // but dependent on proper sign extension of inputs: + // shari.l r5,31,r2 + // xor r5,r2,r20 + // sub r20,r2,r20 // r20 is now absolute value of r5, zero-extended. + shari.l r5,31,r2 + ori r2,1,r2 + muls.l r5,r2,r20 // r20 is now absolute value of r5, zero-extended. + movi 0xffffffffffffbb0c,r19 // shift count equiv 76 + shari.l r4,31,r3 + nsb r20,r0 + shlld r20,r0,r25 + shlri r25,48,r25 + sub r19,r25,r1 + mmulfx.w r1,r1,r2 + mshflo.w r1,r63,r1 + // If r4 was to be used in-place instead of r21, could use this sequence + // to compute absolute: + // sub r63,r4,r19 // compute absolute value of r4 + // shlri r4,32,r3 // into lower 32 bit of r4, keeping + // mcmv r19,r3,r4 // the sign in the upper 32 bits intact. 
+ ori r3,1,r3 + mmulfx.w r25,r2,r2 + sub r19,r0,r0 + muls.l r4,r3,r21 + msub.w r1,r2,r2 + addi r2,-2,r1 + mulu.l r21,r1,r19 + mmulfx.w r2,r2,r2 + shlli r1,15,r1 + shlrd r19,r0,r19 + mulu.l r19,r20,r3 + mmacnfx.wl r25,r2,r1 + ptabs r18,tr0 + sub r21,r3,r25 + + mulu.l r25,r1,r2 + addi r0,14,r0 + xor r4,r5,r18 + shlrd r2,r0,r2 + mulu.l r2,r20,r3 + add r19,r2,r19 + shari.l r18,31,r18 + sub r25,r3,r25 + + mulu.l r25,r1,r2 + sub r25,r20,r25 + add r19,r18,r19 + shlrd r2,r0,r2 + mulu.l r2,r20,r3 + addi r25,1,r25 + add r19,r2,r19 + + cmpgt r25,r3,r25 + add.l r19,r25,r0 + xor r0,r18,r0 + blink tr0,r63 +#else /* ! 0 && ! 0 */ + + // inputs: r4,r5 + // clobbered: r1,r18,r19,r20,r21,r25,tr0 + // result in r0 + HIDDEN_FUNC(GLOBAL(sdivsi3_2)) +#ifndef __pic__ + FUNC(GLOBAL(sdivsi3)) +GLOBAL(sdivsi3): /* this is the shcompact entry point */ + // The special SHmedia entry point sdivsi3_1 prevents accidental linking + // with the SHcompact implementation, which clobbers tr1 / tr2. + .global GLOBAL(sdivsi3_1) +GLOBAL(sdivsi3_1): + .global GLOBAL(div_table_internal) + movi (GLOBAL(div_table_internal) >> 16) & 65535, r20 + shori GLOBAL(div_table_internal) & 65535, r20 +#endif + .global GLOBAL(sdivsi3_2) + // div_table in r20 + // clobbered: r1,r18,r19,r21,r25,tr0 +GLOBAL(sdivsi3_2): + nsb r5, r1 + shlld r5, r1, r25 // normalize; [-2 ..1, 1..2) in s2.62 + shari r25, 58, r21 // extract 5(6) bit index (s2.4 with hole -1..1) + ldx.ub r20, r21, r19 // u0.8 + shari r25, 32, r25 // normalize to s2.30 + shlli r21, 1, r21 + muls.l r25, r19, r19 // s2.38 + ldx.w r20, r21, r21 // s2.14 + ptabs r18, tr0 + shari r19, 24, r19 // truncate to s2.14 + sub r21, r19, r19 // some 11 bit inverse in s1.14 + muls.l r19, r19, r21 // u0.28 + sub r63, r1, r1 + addi r1, 92, r1 + muls.l r25, r21, r18 // s2.58 + shlli r19, 45, r19 // multiply by two and convert to s2.58 + /* bubble */ + sub r19, r18, r18 + shari r18, 28, r18 // some 22 bit inverse in s1.30 + muls.l r18, r25, r0 // s2.60 + muls.l r18, r4, r25 // 
s32.30 + /* bubble */ + shari r0, 16, r19 // s-16.44 + muls.l r19, r18, r19 // s-16.74 + shari r25, 63, r0 + shari r4, 14, r18 // s19.-14 + shari r19, 30, r19 // s-16.44 + muls.l r19, r18, r19 // s15.30 + xor r21, r0, r21 // You could also use the constant 1 << 27. + add r21, r25, r21 + sub r21, r19, r21 + shard r21, r1, r21 + sub r21, r0, r0 + blink tr0, r63 +#ifndef __pic__ + ENDFUNC(GLOBAL(sdivsi3)) +#endif + ENDFUNC(GLOBAL(sdivsi3_2)) +#endif +#elif defined __SHMEDIA__ +/* m5compact-nofpu */ + // clobbered: r18,r19,r20,r21,r25,tr0,tr1,tr2 + .mode SHmedia + .section .text..SHmedia32,"ax" + .align 2 + FUNC(GLOBAL(sdivsi3)) +GLOBAL(sdivsi3): + pt/l LOCAL(sdivsi3_dontsub), tr0 + pt/l LOCAL(sdivsi3_loop), tr1 + ptabs/l r18,tr2 + shari.l r4,31,r18 + shari.l r5,31,r19 + xor r4,r18,r20 + xor r5,r19,r21 + sub.l r20,r18,r20 + sub.l r21,r19,r21 + xor r18,r19,r19 + shlli r21,32,r25 + addi r25,-1,r21 + addz.l r20,r63,r20 +LOCAL(sdivsi3_loop): + shlli r20,1,r20 + bgeu/u r21,r20,tr0 + sub r20,r21,r20 +LOCAL(sdivsi3_dontsub): + addi.l r25,-1,r25 + bnei r25,-32,tr1 + xor r20,r19,r20 + sub.l r20,r19,r0 + blink tr2,r63 + ENDFUNC(GLOBAL(sdivsi3)) +#else /* ! 
__SHMEDIA__ */ + FUNC(GLOBAL(sdivsi3)) +GLOBAL(sdivsi3): + mov r4,r1 + mov r5,r0 + + tst r0,r0 + bt div0 + mov #0,r2 + div0s r2,r1 + subc r3,r3 + subc r2,r1 + div0s r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + div1 r0,r3 + rotcl r1 + addc r2,r1 + rts + mov r1,r0 + + +div0: rts + mov #0,r0 + + ENDFUNC(GLOBAL(sdivsi3)) +#endif /* ! __SHMEDIA__ */ +#endif /* ! __SH4__ */ +#endif +#ifdef L_udivsi3_i4 + + .title "SH DIVIDE" +!! 4 byte integer Divide code for the Renesas SH +#ifdef __SH4__ +!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4, +!! and t bit + + .global GLOBAL(udivsi3_i4) + HIDDEN_FUNC(GLOBAL(udivsi3_i4)) +GLOBAL(udivsi3_i4): + mov #1,r1 + cmp/hi r1,r5 + bf trivial + rotr r1 + xor r1,r4 + lds r4,fpul + mova L1,r0 +#ifdef FMOVD_WORKS + fmov.d @r0+,dr4 +#else + fmov.s @r0+,DR40 + fmov.s @r0,DR41 +#endif + float fpul,dr0 + xor r1,r5 + lds r5,fpul + float fpul,dr2 + fadd dr4,dr0 + fadd dr4,dr2 + fdiv dr2,dr0 + rts + ftrc dr0,fpul + +trivial: + rts + lds r4,fpul + + .align 2 +#ifdef FMOVD_WORKS + .align 3 ! make double below 8 byte aligned. +#endif +L1: + .double 2147483648 + + ENDFUNC(GLOBAL(udivsi3_i4)) +#elif defined (__SH5__) && ! defined (__SH4_NOFPU__) +#if ! __SH5__ || __SH5__ == 32 +!! 
args in r4 and r5, result in fpul, clobber r20, r21, dr0, fr33 + .mode SHmedia + .global GLOBAL(udivsi3_i4) + HIDDEN_FUNC(GLOBAL(udivsi3_i4)) +GLOBAL(udivsi3_i4): + addz.l r4,r63,r20 + addz.l r5,r63,r21 + fmov.qd r20,dr0 + fmov.qd r21,dr32 + ptabs r18,tr0 + float.qd dr0,dr0 + float.qd dr32,dr32 + fdiv.d dr0,dr32,dr0 + ftrc.dq dr0,dr32 + fmov.s fr33,fr32 + blink tr0,r63 + + ENDFUNC(GLOBAL(udivsi3_i4)) +#endif /* ! __SH5__ || __SH5__ == 32 */ +#elif defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) +!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4 + + .global GLOBAL(udivsi3_i4) + HIDDEN_FUNC(GLOBAL(udivsi3_i4)) +GLOBAL(udivsi3_i4): + mov #1,r1 + cmp/hi r1,r5 + bf trivial + sts.l fpscr,@-r15 + mova L1,r0 + lds.l @r0+,fpscr + rotr r1 + xor r1,r4 + lds r4,fpul +#ifdef FMOVD_WORKS + fmov.d @r0+,dr4 +#else + fmov.s @r0+,DR40 + fmov.s @r0,DR41 +#endif + float fpul,dr0 + xor r1,r5 + lds r5,fpul + float fpul,dr2 + fadd dr4,dr0 + fadd dr4,dr2 + fdiv dr2,dr0 + ftrc dr0,fpul + rts + lds.l @r15+,fpscr + +#ifdef FMOVD_WORKS + .align 3 ! make double below 8 byte aligned. +#endif +trivial: + rts + lds r4,fpul + + .align 2 +L1: +#ifndef FMOVD_WORKS + .long 0x80000 +#else + .long 0x180000 +#endif + .double 2147483648 + + ENDFUNC(GLOBAL(udivsi3_i4)) +#endif /* ! __SH4__ */ +#endif + +#ifdef L_udivsi3 +/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with + sh2e/sh3e code. */ +#if (! defined(__SH4__) && ! defined (__SH4_SINGLE__)) || defined (__linux__) + +!! args in r4 and r5, result in r0, clobbers r4, pr, and t bit + .global GLOBAL(udivsi3) + HIDDEN_FUNC(GLOBAL(udivsi3)) + +#if __SHMEDIA__ +#if __SH5__ == 32 + .section .text..SHmedia32,"ax" +#else + .text +#endif + .align 2 +#if 0 +/* The assembly code that follows is a hand-optimized version of the C + code that follows. Note that the registers that are modified are + exactly those listed as clobbered in the patterns udivsi3_i1 and + udivsi3_i1_media. 
+ +unsigned +__udivsi3 (i, j) + unsigned i, j; +{ + register unsigned long long r0 asm ("r0") = 0; + register unsigned long long r18 asm ("r18") = 1; + register unsigned long long r4 asm ("r4") = i; + register unsigned long long r19 asm ("r19") = j; + + r19 <<= 31; + r18 <<= 31; + do + if (r4 >= r19) + r0 |= r18, r4 -= r19; + while (r19 >>= 1, r18 >>= 1); + + return r0; +} +*/ +GLOBAL(udivsi3): + pt/l LOCAL(udivsi3_dontadd), tr2 + pt/l LOCAL(udivsi3_loop), tr1 + ptabs/l r18, tr0 + movi 0, r0 + movi 1, r18 + addz.l r5, r63, r19 + addz.l r4, r63, r4 + shlli r19, 31, r19 + shlli r18, 31, r18 +LOCAL(udivsi3_loop): + bgtu r19, r4, tr2 + or r0, r18, r0 + sub r4, r19, r4 +LOCAL(udivsi3_dontadd): + shlri r18, 1, r18 + shlri r19, 1, r19 + bnei r18, 0, tr1 + blink tr0, r63 +#else +GLOBAL(udivsi3): + // inputs: r4,r5 + // clobbered: r18,r19,r20,r21,r22,r25,tr0 + // result in r0. + addz.l r5,r63,r22 + nsb r22,r0 + shlld r22,r0,r25 + shlri r25,48,r25 + movi 0xffffffffffffbb0c,r20 // shift count equiv 76 + sub r20,r25,r21 + mmulfx.w r21,r21,r19 + mshflo.w r21,r63,r21 + ptabs r18,tr0 + mmulfx.w r25,r19,r19 + sub r20,r0,r0 + /* bubble */ + msub.w r21,r19,r19 + addi r19,-2,r21 /* It would be nice for scheduling to do this add to r21 + before the msub.w, but we need a different value for + r19 to keep errors under control. 
*/ + mulu.l r4,r21,r18 + mmulfx.w r19,r19,r19 + shlli r21,15,r21 + shlrd r18,r0,r18 + mulu.l r18,r22,r20 + mmacnfx.wl r25,r19,r21 + /* bubble */ + sub r4,r20,r25 + + mulu.l r25,r21,r19 + addi r0,14,r0 + /* bubble */ + shlrd r19,r0,r19 + mulu.l r19,r22,r20 + add r18,r19,r18 + /* bubble */ + sub.l r25,r20,r25 + + mulu.l r25,r21,r19 + addz.l r25,r63,r25 + sub r25,r22,r25 + shlrd r19,r0,r19 + mulu.l r19,r22,r20 + addi r25,1,r25 + add r18,r19,r18 + + cmpgt r25,r20,r25 + add.l r18,r25,r0 + blink tr0,r63 +#endif +#elif defined (__SHMEDIA__) +/* m5compact-nofpu - more emphasis on code size than on speed, but don't + ignore speed altogether - div1 needs 9 cycles, subc 7 and rotcl 4. + So use a short shmedia loop. */ + // clobbered: r20,r21,r25,tr0,tr1,tr2 + .mode SHmedia + .section .text..SHmedia32,"ax" + .align 2 +GLOBAL(udivsi3): + pt/l LOCAL(udivsi3_dontsub), tr0 + pt/l LOCAL(udivsi3_loop), tr1 + ptabs/l r18,tr2 + shlli r5,32,r25 + addi r25,-1,r21 + addz.l r4,r63,r20 +LOCAL(udivsi3_loop): + shlli r20,1,r20 + bgeu/u r21,r20,tr0 + sub r20,r21,r20 +LOCAL(udivsi3_dontsub): + addi.l r25,-1,r25 + bnei r25,-32,tr1 + add.l r20,r63,r0 + blink tr2,r63 +#else /* ! 
defined (__SHMEDIA__) */ +LOCAL(div8): + div1 r5,r4 +LOCAL(div7): + div1 r5,r4; div1 r5,r4; div1 r5,r4 + div1 r5,r4; div1 r5,r4; div1 r5,r4; rts; div1 r5,r4 + +LOCAL(divx4): + div1 r5,r4; rotcl r0 + div1 r5,r4; rotcl r0 + div1 r5,r4; rotcl r0 + rts; div1 r5,r4 + +GLOBAL(udivsi3): + sts.l pr,@-r15 + extu.w r5,r0 + cmp/eq r5,r0 +#ifdef __sh1__ + bf LOCAL(large_divisor) +#else + bf/s LOCAL(large_divisor) +#endif + div0u + swap.w r4,r0 + shlr16 r4 + bsr LOCAL(div8) + shll16 r5 + bsr LOCAL(div7) + div1 r5,r4 + xtrct r4,r0 + xtrct r0,r4 + bsr LOCAL(div8) + swap.w r4,r4 + bsr LOCAL(div7) + div1 r5,r4 + lds.l @r15+,pr + xtrct r4,r0 + swap.w r0,r0 + rotcl r0 + rts + shlr16 r5 + +LOCAL(large_divisor): +#ifdef __sh1__ + div0u +#endif + mov #0,r0 + xtrct r4,r0 + xtrct r0,r4 + bsr LOCAL(divx4) + rotcl r0 + bsr LOCAL(divx4) + rotcl r0 + bsr LOCAL(divx4) + rotcl r0 + bsr LOCAL(divx4) + rotcl r0 + lds.l @r15+,pr + rts + rotcl r0 + + ENDFUNC(GLOBAL(udivsi3)) +#endif /* ! __SHMEDIA__ */ +#endif /* __SH4__ */ +#endif /* L_udivsi3 */ + +#ifdef L_udivdi3 +#ifdef __SHMEDIA__ + .mode SHmedia + .section .text..SHmedia32,"ax" + .align 2 + .global GLOBAL(udivdi3) + FUNC(GLOBAL(udivdi3)) +GLOBAL(udivdi3): + HIDDEN_ALIAS(udivdi3_internal,udivdi3) + shlri r3,1,r4 + nsb r4,r22 + shlld r3,r22,r6 + shlri r6,49,r5 + movi 0xffffffffffffbaf1,r21 /* .l shift count 17. 
*/ + sub r21,r5,r1 + mmulfx.w r1,r1,r4 + mshflo.w r1,r63,r1 + sub r63,r22,r20 // r63 == 64 % 64 + mmulfx.w r5,r4,r4 + pta LOCAL(large_divisor),tr0 + addi r20,32,r9 + msub.w r1,r4,r1 + madd.w r1,r1,r1 + mmulfx.w r1,r1,r4 + shlri r6,32,r7 + bgt/u r9,r63,tr0 // large_divisor + mmulfx.w r5,r4,r4 + shlri r2,32+14,r19 + addi r22,-31,r0 + msub.w r1,r4,r1 + + mulu.l r1,r7,r4 + addi r1,-3,r5 + mulu.l r5,r19,r5 + sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2 + shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as + the case may be, %0000000000000000 000.11111111111, still */ + muls.l r1,r4,r4 /* leaving at least one sign bit. */ + mulu.l r5,r3,r8 + mshalds.l r1,r21,r1 + shari r4,26,r4 + shlld r8,r0,r8 + add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5) + sub r2,r8,r2 + /* Can do second step of 64 : 32 div now, using r1 and the rest in r2. */ + + shlri r2,22,r21 + mulu.l r21,r1,r21 + shlld r5,r0,r8 + addi r20,30-22,r0 + shlrd r21,r0,r21 + mulu.l r21,r3,r5 + add r8,r21,r8 + mcmpgt.l r21,r63,r21 // See Note 1 + addi r20,30,r0 + mshfhi.l r63,r21,r21 + sub r2,r5,r2 + andc r2,r21,r2 + + /* small divisor: need a third divide step */ + mulu.l r2,r1,r7 + ptabs r18,tr0 + addi r2,1,r2 + shlrd r7,r0,r7 + mulu.l r7,r3,r5 + add r8,r7,r8 + sub r2,r3,r2 + cmpgt r2,r5,r5 + add r8,r5,r2 + /* could test r3 here to check for divide by zero. */ + blink tr0,r63 + +LOCAL(large_divisor): + mmulfx.w r5,r4,r4 + shlrd r2,r9,r25 + shlri r25,32,r8 + msub.w r1,r4,r1 + + mulu.l r1,r7,r4 + addi r1,-3,r5 + mulu.l r5,r8,r5 + sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2 + shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as + the case may be, %0000000000000000 000.11111111111, still */ + muls.l r1,r4,r4 /* leaving at least one sign bit. */ + shlri r5,14-1,r8 + mulu.l r8,r7,r5 + mshalds.l r1,r21,r1 + shari r4,26,r4 + add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 
0.5) + sub r25,r5,r25 + /* Can do second step of 64 : 32 div now, using r1 and the rest in r25. */ + + shlri r25,22,r21 + mulu.l r21,r1,r21 + pta LOCAL(no_lo_adj),tr0 + addi r22,32,r0 + shlri r21,40,r21 + mulu.l r21,r7,r5 + add r8,r21,r8 + shlld r2,r0,r2 + sub r25,r5,r25 + bgtu/u r7,r25,tr0 // no_lo_adj + addi r8,1,r8 + sub r25,r7,r25 +LOCAL(no_lo_adj): + mextr4 r2,r25,r2 + + /* large_divisor: only needs a few adjustments. */ + mulu.l r8,r6,r5 + ptabs r18,tr0 + /* bubble */ + cmpgtu r5,r2,r5 + sub r8,r5,r2 + blink tr0,r63 + ENDFUNC(GLOBAL(udivdi3)) +/* Note 1: To shift the result of the second divide stage so that the result + always fits into 32 bits, yet we still reduce the rest sufficiently + would require a lot of instructions to do the shifts just right. Using + the full 64 bit shift result to multiply with the divisor would require + four extra instructions for the upper 32 bits (shift / mulu / shift / sub). + Fortunately, if the upper 32 bits of the shift result are nonzero, we + know that the rest after taking this partial result into account will + fit into 32 bits. So we just clear the upper 32 bits of the rest if the + upper 32 bits of the partial result are nonzero. 
*/ +#endif /* __SHMEDIA__ */ +#endif /* L_udivdi3 */ + +#ifdef L_divdi3 +#ifdef __SHMEDIA__ + .mode SHmedia + .section .text..SHmedia32,"ax" + .align 2 + .global GLOBAL(divdi3) + FUNC(GLOBAL(divdi3)) +GLOBAL(divdi3): /* signed 64-bit divide: r2 / r3 -> r2, done by udivdi3_internal on the magnitudes */ + pta GLOBAL(udivdi3_internal),tr0 + shari r2,63,r22 /* r22 = sign mask of the dividend (0 or -1) */ + shari r3,63,r23 /* r23 = sign mask of the divisor (0 or -1) */ + xor r2,r22,r2 + xor r3,r23,r3 + sub r2,r22,r2 /* r2 = |dividend| */ + sub r3,r23,r3 /* r3 = |divisor| */ + beq/u r22,r23,tr0 /* equal signs: branch to udivdi3_internal, which returns through r18 itself */ + ptabs r18,tr1 /* keep our own return target in tr1; udivdi3 reloads tr0 */ + blink tr0,r18 /* call udivdi3_internal with the link in r18 */ + sub r63,r2,r2 /* signs differed: negate the quotient */ + blink tr1,r63 + ENDFUNC(GLOBAL(divdi3)) +#endif /* __SHMEDIA__ */ +#endif /* L_divdi3 */ + +#ifdef L_umoddi3 +#ifdef __SHMEDIA__ + .mode SHmedia + .section .text..SHmedia32,"ax" + .align 2 + .global GLOBAL(umoddi3) + FUNC(GLOBAL(umoddi3)) +GLOBAL(umoddi3): + HIDDEN_ALIAS(umoddi3_internal,umoddi3) + shlri r3,1,r4 + nsb r4,r22 + shlld r3,r22,r6 + shlri r6,49,r5 + movi 0xffffffffffffbaf1,r21 /* .l shift count 17. */ + sub r21,r5,r1 + mmulfx.w r1,r1,r4 + mshflo.w r1,r63,r1 + sub r63,r22,r20 // r63 == 64 % 64 + mmulfx.w r5,r4,r4 + pta LOCAL(large_divisor),tr0 + addi r20,32,r9 + msub.w r1,r4,r1 + madd.w r1,r1,r1 + mmulfx.w r1,r1,r4 + shlri r6,32,r7 + bgt/u r9,r63,tr0 // large_divisor + mmulfx.w r5,r4,r4 + shlri r2,32+14,r19 + addi r22,-31,r0 + msub.w r1,r4,r1 + + mulu.l r1,r7,r4 + addi r1,-3,r5 + mulu.l r5,r19,r5 + sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2 + shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as + the case may be, %0000000000000000 000.11111111111, still */ + muls.l r1,r4,r4 /* leaving at least one sign bit. */ + mulu.l r5,r3,r5 + mshalds.l r1,r21,r1 + shari r4,26,r4 + shlld r5,r0,r5 + add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5) + sub r2,r5,r2 + /* Can do second step of 64 : 32 div now, using r1 and the rest in r2. */ + + shlri r2,22,r21 + mulu.l r21,r1,r21 + addi r20,30-22,r0 + /* bubble */ /* could test r3 here to check for divide by zero. 
*/ + shlrd r21,r0,r21 + mulu.l r21,r3,r5 + mcmpgt.l r21,r63,r21 // See Note 1 + addi r20,30,r0 + mshfhi.l r63,r21,r21 + sub r2,r5,r2 + andc r2,r21,r2 + + /* small divisor: need a third divide step */ + mulu.l r2,r1,r7 + ptabs r18,tr0 + sub r2,r3,r8 /* re-use r8 here for rest - r3 */ + shlrd r7,r0,r7 + mulu.l r7,r3,r5 + /* bubble */ + addi r8,1,r7 + cmpgt r7,r5,r7 + cmvne r7,r8,r2 + sub r2,r5,r2 + blink tr0,r63 + +LOCAL(large_divisor): + mmulfx.w r5,r4,r4 + shlrd r2,r9,r25 + shlri r25,32,r8 + msub.w r1,r4,r1 + + mulu.l r1,r7,r4 + addi r1,-3,r5 + mulu.l r5,r8,r5 + sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2 + shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as + the case may be, %0000000000000000 000.11111111111, still */ + muls.l r1,r4,r4 /* leaving at least one sign bit. */ + shlri r5,14-1,r8 + mulu.l r8,r7,r5 + mshalds.l r1,r21,r1 + shari r4,26,r4 + add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5) + sub r25,r5,r25 + /* Can do second step of 64 : 32 div now, using r1 and the rest in r25. */ + + shlri r25,22,r21 + mulu.l r21,r1,r21 + pta LOCAL(no_lo_adj),tr0 + addi r22,32,r0 + shlri r21,40,r21 + mulu.l r21,r7,r5 + add r8,r21,r8 + shlld r2,r0,r2 + sub r25,r5,r25 + bgtu/u r7,r25,tr0 // no_lo_adj + addi r8,1,r8 + sub r25,r7,r25 +LOCAL(no_lo_adj): + mextr4 r2,r25,r2 + + /* large_divisor: only needs a few adjustments. */ + mulu.l r8,r6,r5 + ptabs r18,tr0 + add r2,r6,r7 + cmpgtu r5,r2,r8 + cmvne r8,r7,r2 + sub r2,r5,r2 + shlrd r2,r22,r2 + blink tr0,r63 + ENDFUNC(GLOBAL(umoddi3)) +/* Note 1: To shift the result of the second divide stage so that the result + always fits into 32 bits, yet we still reduce the rest sufficiently + would require a lot of instructions to do the shifts just right. Using + the full 64 bit shift result to multiply with the divisor would require + four extra instructions for the upper 32 bits (shift / mulu / shift / sub). 
+ Fortunately, if the upper 32 bits of the shift result are nonzero, we + know that the rest after taking this partial result into account will + fit into 32 bits. So we just clear the upper 32 bits of the rest if the + upper 32 bits of the partial result are nonzero. */ +#endif /* __SHMEDIA__ */ +#endif /* L_umoddi3 */ + +#ifdef L_moddi3 +#ifdef __SHMEDIA__ + .mode SHmedia + .section .text..SHmedia32,"ax" + .align 2 + .global GLOBAL(moddi3) + FUNC(GLOBAL(moddi3)) +GLOBAL(moddi3): /* signed 64-bit remainder: r2 % r3 -> r2, done by umoddi3_internal on the magnitudes */ + pta GLOBAL(umoddi3_internal),tr0 + shari r2,63,r22 /* r22 = sign mask of the dividend (0 or -1) */ + shari r3,63,r23 /* r23 = sign mask of the divisor (0 or -1) */ + xor r2,r22,r2 + xor r3,r23,r3 + sub r2,r22,r2 /* r2 = |dividend| */ + sub r3,r23,r3 /* r3 = |divisor| */ + beq/u r22,r63,tr0 /* dividend not negative: branch to umoddi3_internal, no sign fix-up needed */ + ptabs r18,tr1 /* keep our own return target in tr1; umoddi3 reloads tr0 */ + blink tr0,r18 /* call umoddi3_internal with the link in r18 */ + sub r63,r2,r2 /* dividend was negative: negate the remainder */ + blink tr1,r63 + ENDFUNC(GLOBAL(moddi3)) +#endif /* __SHMEDIA__ */ +#endif /* L_moddi3 */ + +#ifdef L_set_fpscr +#if !defined (__SH2A_NOFPU__) +#if defined (__SH2E__) || defined (__SH2A__) || defined (__SH3E__) || defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || __SH5__ == 32 +#ifdef __SH5__ + .mode SHcompact +#endif + .global GLOBAL(set_fpscr) + HIDDEN_FUNC(GLOBAL(set_fpscr)) +GLOBAL(set_fpscr): /* In: r4 = new FPSCR value; also stores two derived words into fpscr_values */ + lds r4,fpscr +#ifdef __PIC__ + mov.l r12,@-r15 /* r12 is borrowed as the GOT pointer and restored below */ +#ifdef __vxworks + mov.l LOCAL(set_fpscr_L0_base),r12 + mov.l LOCAL(set_fpscr_L0_index),r0 + mov.l @r12,r12 + mov.l @(r0,r12),r12 +#else + mova LOCAL(set_fpscr_L0),r0 + mov.l LOCAL(set_fpscr_L0),r12 + add r0,r12 +#endif + mov.l LOCAL(set_fpscr_L1),r0 + mov.l @(r0,r12),r1 /* r1 = address of fpscr_values, read from its GOT slot */ + mov.l @r15+,r12 +#else + mov.l LOCAL(set_fpscr_L1),r1 /* r1 = address of fpscr_values */ +#endif + swap.w r4,r0 /* bring bits 16..31 of r4 into the low half so the byte immediates can reach them */ + or #24,r0 /* sets original bits 19 and 20 (presumably FPSCR.PR and FPSCR.SZ - TODO confirm) */ +#ifndef FMOVD_WORKS + xor #16,r0 +#endif +#if defined(__SH4__) || defined (__SH2A_DOUBLE__) + swap.w r0,r3 + mov.l r3,@(4,r1) +#else /* defined (__SH2E__) || defined(__SH3E__) || defined(__SH4_SINGLE*__) */ + swap.w r0,r2 + mov.l r2,@r1 +#endif +#ifndef FMOVD_WORKS + xor #8,r0 +#else + xor #24,r0 +#endif +#if defined(__SH4__) || defined (__SH2A_DOUBLE__) + swap.w r0,r2 + rts + mov.l r2,@r1 +#else /* defined(__SH2E__) || defined(__SH3E__) || 
defined(__SH4_SINGLE*__) */ + swap.w r0,r3 + rts + mov.l r3,@(4,r1) +#endif + .align 2 +#ifdef __PIC__ +#ifdef __vxworks +LOCAL(set_fpscr_L0_base): + .long ___GOTT_BASE__ +LOCAL(set_fpscr_L0_index): + .long ___GOTT_INDEX__ +#else +LOCAL(set_fpscr_L0): + .long _GLOBAL_OFFSET_TABLE_ +#endif +LOCAL(set_fpscr_L1): + .long GLOBAL(fpscr_values@GOT) +#else +LOCAL(set_fpscr_L1): + .long GLOBAL(fpscr_values) +#endif + + ENDFUNC(GLOBAL(set_fpscr)) +#ifndef NO_FPSCR_VALUES +#ifdef __ELF__ + .comm GLOBAL(fpscr_values),8,4 +#else + .comm GLOBAL(fpscr_values),8 +#endif /* ELF */ +#endif /* NO_FPSCR_VALUES */ +#endif /* SH2E / SH3E / SH4 */ +#endif /* __SH2A_NOFPU__ */ +#endif /* L_set_fpscr */ +#ifdef L_ic_invalidate +#if __SH5__ == 32 + .mode SHmedia + .section .text..SHmedia32,"ax" + .align 2 + .global GLOBAL(init_trampoline) + HIDDEN_FUNC(GLOBAL(init_trampoline)) +GLOBAL(init_trampoline): /* In: r0 = trampoline address, r2 and r3 = the two words stored at offsets 8 and 12 */ + st.l r0,8,r2 +#ifdef __LITTLE_ENDIAN__ + movi 9,r20 + shori 0x402b,r20 + shori 0xd101,r20 + shori 0xd002,r20 +#else + movi 0xffffffffffffd002,r20 + shori 0xd101,r20 + shori 0x402b,r20 + shori 9,r20 +#endif + st.q r0,0,r20 /* either way memory holds the 16-bit words 0xd002, 0xd101, 0x402b, 0x0009 (presumably SHcompact trampoline code - TODO confirm) */ + st.l r0,12,r3 /* no return instruction: execution falls through into ic_invalidate below */ + ENDFUNC(GLOBAL(init_trampoline)) + .global GLOBAL(ic_invalidate) + HIDDEN_FUNC(GLOBAL(ic_invalidate)) +GLOBAL(ic_invalidate): /* In: r0 = address of the cache block, r18 = return address */ + ocbwb r0,0 + synco + icbi r0, 0 + ptabs r18, tr0 + synci + blink tr0, r63 + ENDFUNC(GLOBAL(ic_invalidate)) +#elif defined(__SH4A__) + .global GLOBAL(ic_invalidate) + HIDDEN_FUNC(GLOBAL(ic_invalidate)) +GLOBAL(ic_invalidate): /* In: r4 = address of the cache block */ + ocbwb @r4 + synco + icbi @r4 + rts + nop + ENDFUNC(GLOBAL(ic_invalidate)) +#elif defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || (defined(__SH4_NOFPU__) && !defined(__SH5__)) + /* For system code, we use ic_invalidate_line_i, but user code + needs a different mechanism. A kernel call is generally not + available, and it would also be slow. Different SH4 variants use + different sizes and associativities of the Icache. 
We use a small + bit of dispatch code that can be put hidden in every shared object, + which calls the actual processor-specific invalidation code in a + separate module. + Or if you have operating system support, the OS could mmap the + processor-specific code from a single page, since it is highly + repetitive. + In: r4 = address to invalidate. Control is passed on with jmp, so the + rts inside ic_invalidate_array returns straight to our caller. */ + .global GLOBAL(ic_invalidate) + HIDDEN_FUNC(GLOBAL(ic_invalidate)) +GLOBAL(ic_invalidate): +#ifdef __pic__ +#ifdef __vxworks + mov.l 1f,r1 + mov.l 2f,r0 + mov.l @r1,r1 + mov.l 0f,r2 + mov.l @(r0,r1),r0 +#else + mov.l 1f,r1 + mova 1f,r0 + mov.l 0f,r2 + add r1,r0 +#endif + mov.l @(r0,r2),r1 /* r1 = ic_invalidate_array, read from its GOT slot */ +#else + mov.l 0f,r1 /* r1 = ic_invalidate_array */ +#endif + ocbwb @r4 /* write the data cache line back first */ + mov.l @(8,r1),r0 /* r0 = mask word stored at offset 8 of the array */ + sub r1,r4 /* r4 = address relative to the array */ + and r4,r0 + add r1,r0 /* r0 = array + (masked relative address) = entry to jump to */ + jmp @r0 + mov.l @(4,r1),r0 /* delay slot: r0 = word stored at offset 4 of the array */ + .align 2 +#ifndef __pic__ +0: .long GLOBAL(ic_invalidate_array) +#else /* __pic__ */ + .global GLOBAL(ic_invalidate_array) +0: .long GLOBAL(ic_invalidate_array)@GOT +#ifdef __vxworks +1: .long ___GOTT_BASE__ +2: .long ___GOTT_INDEX__ +#else +1: .long _GLOBAL_OFFSET_TABLE_ +#endif +#endif /* __pic__ */ + /* Close the function in both the PIC and the non-PIC build. */ + ENDFUNC(GLOBAL(ic_invalidate)) +#endif /* SH4 */ +#endif /* L_ic_invalidate */ + +#ifdef L_ic_invalidate_array +#if defined(__SH4A__) || (defined (__FORCE_SH4A__) && (defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || (defined(__SH4_NOFPU__) && !defined(__SH5__)))) + .global GLOBAL(ic_invalidate_array) + /* This is needed when an SH4 dso with trampolines is used on SH4A. */ + .global GLOBAL(ic_invalidate_array) + FUNC(GLOBAL(ic_invalidate_array)) +GLOBAL(ic_invalidate_array): + add r1,r4 /* undo the sub r1,r4 done by the SH4 ic_invalidate dispatcher */ + synco + icbi @r4 + rts + nop + .align 2 + .long 0 /* NOTE(review): the SH4 dispatcher reads its mask at offset 8, but five 16-bit instructions plus .align 2 appear to place this word at offset 12 - confirm */ + ENDFUNC(GLOBAL(ic_invalidate_array)) +#elif defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || (defined(__SH4_NOFPU__) && !defined(__SH5__)) + .global GLOBAL(ic_invalidate_array) + .p2align 5 + FUNC(GLOBAL(ic_invalidate_array)) +/* This must be aligned to the beginning of a cache line. 
*/ +GLOBAL(ic_invalidate_array): +#ifndef WAYS +#define WAYS 4 +#define WAY_SIZE 0x4000 +#endif +#if WAYS == 1 + .rept WAY_SIZE * WAYS / 32 + rts + nop + .rept 7 + .long WAY_SIZE - 32 + .endr + .endr +#elif WAYS <= 6 + .rept WAY_SIZE * WAYS / 32 + braf r0 + add #-8,r0 + .long WAY_SIZE + 8 + .long WAY_SIZE - 32 + .rept WAYS-2 + braf r0 + nop + .endr + .rept 7 - WAYS + rts + nop + .endr + .endr +#else /* WAYS > 6 */ + /* This variant needs two different pages for mmap-ing. */ + .rept WAYS-1 + .rept WAY_SIZE / 32 + braf r0 + nop + .long WAY_SIZE + .rept 6 + .long WAY_SIZE - 32 + .endr + .endr + .endr + .rept WAY_SIZE / 32 + rts + .rept 15 + nop + .endr + .endr +#endif /* WAYS */ + ENDFUNC(GLOBAL(ic_invalidate_array)) +#endif /* SH4 */ +#endif /* L_ic_invalidate_array */ + +#if defined (__SH5__) && __SH5__ == 32 +#ifdef L_shcompact_call_trampoline + .section .rodata + .align 1 +LOCAL(ct_main_table): +.word LOCAL(ct_r2_fp) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r2_ld) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r2_pop) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r3_fp) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r3_ld) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r3_pop) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r4_fp) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r4_ld) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r4_pop) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r5_fp) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r5_ld) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r5_pop) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r6_fph) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r6_fpl) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r6_ld) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r6_pop) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r7_fph) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r7_fpl) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r7_ld) - datalabel LOCAL(ct_main_label) +.word 
LOCAL(ct_r7_pop) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r8_fph) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r8_fpl) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r8_ld) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r8_pop) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r9_fph) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r9_fpl) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r9_ld) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r9_pop) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_pop_seq) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_pop_seq) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_r9_pop) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_ret_wide) - datalabel LOCAL(ct_main_label) +.word LOCAL(ct_call_func) - datalabel LOCAL(ct_main_label) + .mode SHmedia + .section .text..SHmedia32, "ax" + .align 2 + + /* This function loads 64-bit general-purpose registers from the + stack, from a memory address contained in them or from an FP + register, according to a cookie passed in r1. Its execution + time is linear on the number of registers that actually have + to be copied. See sh.h for details on the actual bit pattern. + + The function to be called is passed in r0. If a 32-bit return + value is expected, the actual function will be tail-called, + otherwise the return address will be stored in r10 (that the + caller should expect to be clobbered) and the return value + will be expanded into r2/r3 upon return. */ + + .global GLOBAL(GCC_shcompact_call_trampoline) + FUNC(GLOBAL(GCC_shcompact_call_trampoline)) +GLOBAL(GCC_shcompact_call_trampoline): + ptabs/l r0, tr0 /* Prepare to call the actual function. 
*/ + movi ((datalabel LOCAL(ct_main_table) - 31 * 2) >> 16) & 65535, r0 + pt/l LOCAL(ct_loop), tr1 + addz.l r1, r63, r1 + shori ((datalabel LOCAL(ct_main_table) - 31 * 2)) & 65535, r0 +LOCAL(ct_loop): + nsb r1, r28 + shlli r28, 1, r29 + ldx.w r0, r29, r30 +LOCAL(ct_main_label): + ptrel/l r30, tr2 + blink tr2, r63 +LOCAL(ct_r2_fp): /* Copy r2 from an FP register. */ + /* It must be dr0, so just do it. */ + fmov.dq dr0, r2 + movi 7, r30 + shlli r30, 29, r31 + andc r1, r31, r1 + blink tr1, r63 +LOCAL(ct_r3_fp): /* Copy r3 from an FP register. */ + /* It is either dr0 or dr2. */ + movi 7, r30 + shlri r1, 26, r32 + shlli r30, 26, r31 + andc r1, r31, r1 + fmov.dq dr0, r3 + beqi/l r32, 4, tr1 + fmov.dq dr2, r3 + blink tr1, r63 +LOCAL(ct_r4_fp): /* Copy r4 from an FP register. */ + shlri r1, 23 - 3, r34 + andi r34, 3 << 3, r33 + addi r33, LOCAL(ct_r4_fp_copy) - datalabel LOCAL(ct_r4_fp_base), r32 +LOCAL(ct_r4_fp_base): + ptrel/l r32, tr2 + movi 7, r30 + shlli r30, 23, r31 + andc r1, r31, r1 + blink tr2, r63 +LOCAL(ct_r4_fp_copy): + fmov.dq dr0, r4 + blink tr1, r63 + fmov.dq dr2, r4 + blink tr1, r63 + fmov.dq dr4, r4 + blink tr1, r63 +LOCAL(ct_r5_fp): /* Copy r5 from an FP register. */ + shlri r1, 20 - 3, r34 + andi r34, 3 << 3, r33 + addi r33, LOCAL(ct_r5_fp_copy) - datalabel LOCAL(ct_r5_fp_base), r32 +LOCAL(ct_r5_fp_base): + ptrel/l r32, tr2 + movi 7, r30 + shlli r30, 20, r31 + andc r1, r31, r1 + blink tr2, r63 +LOCAL(ct_r5_fp_copy): + fmov.dq dr0, r5 + blink tr1, r63 + fmov.dq dr2, r5 + blink tr1, r63 + fmov.dq dr4, r5 + blink tr1, r63 + fmov.dq dr6, r5 + blink tr1, r63 +LOCAL(ct_r6_fph): /* Copy r6 from a high FP register. */ + /* It must be dr8. */ + fmov.dq dr8, r6 + movi 15, r30 + shlli r30, 16, r31 + andc r1, r31, r1 + blink tr1, r63 +LOCAL(ct_r6_fpl): /* Copy r6 from a low FP register. 
*/ + shlri r1, 16 - 3, r34 + andi r34, 3 << 3, r33 + addi r33, LOCAL(ct_r6_fp_copy) - datalabel LOCAL(ct_r6_fp_base), r32 +LOCAL(ct_r6_fp_base): + ptrel/l r32, tr2 + movi 7, r30 + shlli r30, 16, r31 + andc r1, r31, r1 + blink tr2, r63 +LOCAL(ct_r6_fp_copy): + fmov.dq dr0, r6 + blink tr1, r63 + fmov.dq dr2, r6 + blink tr1, r63 + fmov.dq dr4, r6 + blink tr1, r63 + fmov.dq dr6, r6 + blink tr1, r63 +LOCAL(ct_r7_fph): /* Copy r7 from a high FP register. */ + /* It is either dr8 or dr10. */ + movi 15 << 12, r31 + shlri r1, 12, r32 + andc r1, r31, r1 + fmov.dq dr8, r7 + beqi/l r32, 8, tr1 + fmov.dq dr10, r7 + blink tr1, r63 +LOCAL(ct_r7_fpl): /* Copy r7 from a low FP register. */ + shlri r1, 12 - 3, r34 + andi r34, 3 << 3, r33 + addi r33, LOCAL(ct_r7_fp_copy) - datalabel LOCAL(ct_r7_fp_base), r32 +LOCAL(ct_r7_fp_base): + ptrel/l r32, tr2 + movi 7 << 12, r31 + andc r1, r31, r1 + blink tr2, r63 +LOCAL(ct_r7_fp_copy): + fmov.dq dr0, r7 + blink tr1, r63 + fmov.dq dr2, r7 + blink tr1, r63 + fmov.dq dr4, r7 + blink tr1, r63 + fmov.dq dr6, r7 + blink tr1, r63 +LOCAL(ct_r8_fph): /* Copy r8 from a high FP register. */ + /* It is either dr8 or dr10. */ + movi 15 << 8, r31 + andi r1, 1 << 8, r32 + andc r1, r31, r1 + fmov.dq dr8, r8 + beq/l r32, r63, tr1 + fmov.dq dr10, r8 + blink tr1, r63 +LOCAL(ct_r8_fpl): /* Copy r8 from a low FP register. */ + shlri r1, 8 - 3, r34 + andi r34, 3 << 3, r33 + addi r33, LOCAL(ct_r8_fp_copy) - datalabel LOCAL(ct_r8_fp_base), r32 +LOCAL(ct_r8_fp_base): + ptrel/l r32, tr2 + movi 7 << 8, r31 + andc r1, r31, r1 + blink tr2, r63 +LOCAL(ct_r8_fp_copy): + fmov.dq dr0, r8 + blink tr1, r63 + fmov.dq dr2, r8 + blink tr1, r63 + fmov.dq dr4, r8 + blink tr1, r63 + fmov.dq dr6, r8 + blink tr1, r63 +LOCAL(ct_r9_fph): /* Copy r9 from a high FP register. */ + /* It is either dr8 or dr10. 
*/ + movi 15 << 4, r31 + andi r1, 1 << 4, r32 + andc r1, r31, r1 + fmov.dq dr8, r9 + beq/l r32, r63, tr1 + fmov.dq dr10, r9 + blink tr1, r63 +LOCAL(ct_r9_fpl): /* Copy r9 from a low FP register. */ + shlri r1, 4 - 3, r34 + andi r34, 3 << 3, r33 + addi r33, LOCAL(ct_r9_fp_copy) - datalabel LOCAL(ct_r9_fp_base), r32 +LOCAL(ct_r9_fp_base): + ptrel/l r32, tr2 + movi 7 << 4, r31 + andc r1, r31, r1 + blink tr2, r63 +LOCAL(ct_r9_fp_copy): + fmov.dq dr0, r9 + blink tr1, r63 + fmov.dq dr2, r9 + blink tr1, r63 + fmov.dq dr4, r9 + blink tr1, r63 + fmov.dq dr6, r9 + blink tr1, r63 +LOCAL(ct_r2_ld): /* Copy r2 from a memory address. */ + pt/l LOCAL(ct_r2_load), tr2 + movi 3, r30 + shlli r30, 29, r31 + and r1, r31, r32 + andc r1, r31, r1 + beq/l r31, r32, tr2 + addi.l r2, 8, r3 + ldx.q r2, r63, r2 + /* Fall through. */ +LOCAL(ct_r3_ld): /* Copy r3 from a memory address. */ + pt/l LOCAL(ct_r3_load), tr2 + movi 3, r30 + shlli r30, 26, r31 + and r1, r31, r32 + andc r1, r31, r1 + beq/l r31, r32, tr2 + addi.l r3, 8, r4 + ldx.q r3, r63, r3 +LOCAL(ct_r4_ld): /* Copy r4 from a memory address. */ + pt/l LOCAL(ct_r4_load), tr2 + movi 3, r30 + shlli r30, 23, r31 + and r1, r31, r32 + andc r1, r31, r1 + beq/l r31, r32, tr2 + addi.l r4, 8, r5 + ldx.q r4, r63, r4 +LOCAL(ct_r5_ld): /* Copy r5 from a memory address. */ + pt/l LOCAL(ct_r5_load), tr2 + movi 3, r30 + shlli r30, 20, r31 + and r1, r31, r32 + andc r1, r31, r1 + beq/l r31, r32, tr2 + addi.l r5, 8, r6 + ldx.q r5, r63, r5 +LOCAL(ct_r6_ld): /* Copy r6 from a memory address. */ + pt/l LOCAL(ct_r6_load), tr2 + movi 3 << 16, r31 + and r1, r31, r32 + andc r1, r31, r1 + beq/l r31, r32, tr2 + addi.l r6, 8, r7 + ldx.q r6, r63, r6 +LOCAL(ct_r7_ld): /* Copy r7 from a memory address. */ + pt/l LOCAL(ct_r7_load), tr2 + movi 3 << 12, r31 + and r1, r31, r32 + andc r1, r31, r1 + beq/l r31, r32, tr2 + addi.l r7, 8, r8 + ldx.q r7, r63, r7 +LOCAL(ct_r8_ld): /* Copy r8 from a memory address. 
*/ + pt/l LOCAL(ct_r8_load), tr2 + movi 3 << 8, r31 + and r1, r31, r32 + andc r1, r31, r1 + beq/l r31, r32, tr2 + addi.l r8, 8, r9 + ldx.q r8, r63, r8 +LOCAL(ct_r9_ld): /* Copy r9 from a memory address. */ + pt/l LOCAL(ct_check_tramp), tr2 + ldx.q r9, r63, r9 + blink tr2, r63 +LOCAL(ct_r2_load): + ldx.q r2, r63, r2 + blink tr1, r63 +LOCAL(ct_r3_load): + ldx.q r3, r63, r3 + blink tr1, r63 +LOCAL(ct_r4_load): + ldx.q r4, r63, r4 + blink tr1, r63 +LOCAL(ct_r5_load): + ldx.q r5, r63, r5 + blink tr1, r63 +LOCAL(ct_r6_load): + ldx.q r6, r63, r6 + blink tr1, r63 +LOCAL(ct_r7_load): + ldx.q r7, r63, r7 + blink tr1, r63 +LOCAL(ct_r8_load): + ldx.q r8, r63, r8 + blink tr1, r63 +LOCAL(ct_r2_pop): /* Pop r2 from the stack. */ + movi 1, r30 + ldx.q r15, r63, r2 + shlli r30, 29, r31 + addi.l r15, 8, r15 + andc r1, r31, r1 + blink tr1, r63 +LOCAL(ct_r3_pop): /* Pop r3 from the stack. */ + movi 1, r30 + ldx.q r15, r63, r3 + shlli r30, 26, r31 + addi.l r15, 8, r15 + andc r1, r31, r1 + blink tr1, r63 +LOCAL(ct_r4_pop): /* Pop r4 from the stack. */ + movi 1, r30 + ldx.q r15, r63, r4 + shlli r30, 23, r31 + addi.l r15, 8, r15 + andc r1, r31, r1 + blink tr1, r63 +LOCAL(ct_r5_pop): /* Pop r5 from the stack. */ + movi 1, r30 + ldx.q r15, r63, r5 + shlli r30, 20, r31 + addi.l r15, 8, r15 + andc r1, r31, r1 + blink tr1, r63 +LOCAL(ct_r6_pop): /* Pop r6 from the stack. */ + movi 1, r30 + ldx.q r15, r63, r6 + shlli r30, 16, r31 + addi.l r15, 8, r15 + andc r1, r31, r1 + blink tr1, r63 +LOCAL(ct_r7_pop): /* Pop r7 from the stack. */ + ldx.q r15, r63, r7 + movi 1 << 12, r31 + addi.l r15, 8, r15 + andc r1, r31, r1 + blink tr1, r63 +LOCAL(ct_r8_pop): /* Pop r8 from the stack. */ + ldx.q r15, r63, r8 + movi 1 << 8, r31 + addi.l r15, 8, r15 + andc r1, r31, r1 + blink tr1, r63 +LOCAL(ct_pop_seq): /* Pop a sequence of registers off the stack. 
*/ + andi r1, 7 << 1, r30 + movi (LOCAL(ct_end_of_pop_seq) >> 16) & 65535, r32 + shlli r30, 2, r31 + shori LOCAL(ct_end_of_pop_seq) & 65535, r32 + sub.l r32, r31, r33 + ptabs/l r33, tr2 + blink tr2, r63 +LOCAL(ct_start_of_pop_seq): /* Beginning of pop sequence. */ + ldx.q r15, r63, r3 + addi.l r15, 8, r15 + ldx.q r15, r63, r4 + addi.l r15, 8, r15 + ldx.q r15, r63, r5 + addi.l r15, 8, r15 + ldx.q r15, r63, r6 + addi.l r15, 8, r15 + ldx.q r15, r63, r7 + addi.l r15, 8, r15 + ldx.q r15, r63, r8 + addi.l r15, 8, r15 +LOCAL(ct_r9_pop): /* Pop r9 from the stack. */ + ldx.q r15, r63, r9 + addi.l r15, 8, r15 +LOCAL(ct_end_of_pop_seq): /* Label used to compute first pop instruction. */ +LOCAL(ct_check_tramp): /* Check whether we need a trampoline. */ + pt/u LOCAL(ct_ret_wide), tr2 + andi r1, 1, r1 + bne/u r1, r63, tr2 +LOCAL(ct_call_func): /* Just branch to the function. */ + blink tr0, r63 +LOCAL(ct_ret_wide): /* Call the function, so that we can unpack its + 64-bit return value. */ + add.l r18, r63, r10 + blink tr0, r18 + ptabs r10, tr0 +#if __LITTLE_ENDIAN__ + shari r2, 32, r3 + add.l r2, r63, r2 +#else + add.l r2, r63, r3 + shari r2, 32, r2 +#endif + blink tr0, r63 + + ENDFUNC(GLOBAL(GCC_shcompact_call_trampoline)) +#endif /* L_shcompact_call_trampoline */ + +#ifdef L_shcompact_return_trampoline + /* This function does the converse of the code in `ret_wide' + above. It is tail-called by SHcompact functions returning + 64-bit non-floating-point values, to pack the 32-bit values in + r2 and r3 into r2. 
*/ + + .mode SHmedia + .section .text..SHmedia32, "ax" + .align 2 + .global GLOBAL(GCC_shcompact_return_trampoline) + HIDDEN_FUNC(GLOBAL(GCC_shcompact_return_trampoline)) +GLOBAL(GCC_shcompact_return_trampoline): + ptabs/l r18, tr0 /* tr0 = return target taken from r18 */ +#if __LITTLE_ENDIAN__ + addz.l r2, r63, r2 /* r2 = low word, zero-extended */ + shlli r3, 32, r3 /* r3 = high word moved to bits 32..63 */ +#else + addz.l r3, r63, r3 /* r3 = low word, zero-extended */ + shlli r2, 32, r2 /* r2 = high word moved to bits 32..63 */ +#endif + or r3, r2, r2 /* r2 = the two 32-bit halves packed into one 64-bit value */ + blink tr0, r63 + + ENDFUNC(GLOBAL(GCC_shcompact_return_trampoline)) +#endif /* L_shcompact_return_trampoline */ + +#ifdef L_shcompact_incoming_args + .section .rodata + .align 1 +LOCAL(ia_main_table): +.word 1 /* Invalid, just loop */ +.word LOCAL(ia_r2_ld) - datalabel LOCAL(ia_main_label) +.word LOCAL(ia_r2_push) - datalabel LOCAL(ia_main_label) +.word 1 /* Invalid, just loop */ +.word LOCAL(ia_r3_ld) - datalabel LOCAL(ia_main_label) +.word LOCAL(ia_r3_push) - datalabel LOCAL(ia_main_label) +.word 1 /* Invalid, just loop */ +.word LOCAL(ia_r4_ld) - datalabel LOCAL(ia_main_label) +.word LOCAL(ia_r4_push) - datalabel LOCAL(ia_main_label) +.word 1 /* Invalid, just loop */ +.word LOCAL(ia_r5_ld) - datalabel LOCAL(ia_main_label) +.word LOCAL(ia_r5_push) - datalabel LOCAL(ia_main_label) +.word 1 /* Invalid, just loop */ +.word 1 /* Invalid, just loop */ +.word LOCAL(ia_r6_ld) - datalabel LOCAL(ia_main_label) +.word LOCAL(ia_r6_push) - datalabel LOCAL(ia_main_label) +.word 1 /* Invalid, just loop */ +.word 1 /* Invalid, just loop */ +.word LOCAL(ia_r7_ld) - datalabel LOCAL(ia_main_label) +.word LOCAL(ia_r7_push) - datalabel LOCAL(ia_main_label) +.word 1 /* Invalid, just loop */ +.word 1 /* Invalid, just loop */ +.word LOCAL(ia_r8_ld) - datalabel LOCAL(ia_main_label) +.word LOCAL(ia_r8_push) - datalabel LOCAL(ia_main_label) +.word 1 /* Invalid, just loop */ +.word 1 /* Invalid, just loop */ +.word LOCAL(ia_r9_ld) - datalabel LOCAL(ia_main_label) +.word LOCAL(ia_r9_push) - datalabel LOCAL(ia_main_label) +.word LOCAL(ia_push_seq) - datalabel LOCAL(ia_main_label) +.word LOCAL(ia_push_seq) - datalabel 
LOCAL(ia_main_label) +.word LOCAL(ia_r9_push) - datalabel LOCAL(ia_main_label) +.word LOCAL(ia_return) - datalabel LOCAL(ia_main_label) +.word LOCAL(ia_return) - datalabel LOCAL(ia_main_label) + .mode SHmedia + .section .text..SHmedia32, "ax" + .align 2 + + /* This function stores 64-bit general-purpose registers back in + the stack, and loads the address in which each register + was stored into itself. The lower 32 bits of r17 hold the address + to begin storing, and the upper 32 bits of r17 hold the cookie. + Its execution time is linear on the + number of registers that actually have to be copied, and it is + optimized for structures larger than 64 bits, as opposed to + individual `long long' arguments. See sh.h for details on the + actual bit pattern. */ + + .global GLOBAL(GCC_shcompact_incoming_args) + FUNC(GLOBAL(GCC_shcompact_incoming_args)) +GLOBAL(GCC_shcompact_incoming_args): + ptabs/l r18, tr0 /* Prepare to return. */ + shlri r17, 32, r0 /* Load the cookie. */ + movi ((datalabel LOCAL(ia_main_table) - 31 * 2) >> 16) & 65535, r43 + pt/l LOCAL(ia_loop), tr1 + add.l r17, r63, r17 + shori ((datalabel LOCAL(ia_main_table) - 31 * 2)) & 65535, r43 +LOCAL(ia_loop): + nsb r0, r36 + shlli r36, 1, r37 + ldx.w r43, r37, r38 +LOCAL(ia_main_label): + ptrel/l r38, tr2 + blink tr2, r63 +LOCAL(ia_r2_ld): /* Store r2 and load its address. */ + movi 3, r38 + shlli r38, 29, r39 + and r0, r39, r40 + andc r0, r39, r0 + stx.q r17, r63, r2 + add.l r17, r63, r2 + addi.l r17, 8, r17 + beq/u r39, r40, tr1 +LOCAL(ia_r3_ld): /* Store r3 and load its address. */ + movi 3, r38 + shlli r38, 26, r39 + and r0, r39, r40 + andc r0, r39, r0 + stx.q r17, r63, r3 + add.l r17, r63, r3 + addi.l r17, 8, r17 + beq/u r39, r40, tr1 +LOCAL(ia_r4_ld): /* Store r4 and load its address. */ + movi 3, r38 + shlli r38, 23, r39 + and r0, r39, r40 + andc r0, r39, r0 + stx.q r17, r63, r4 + add.l r17, r63, r4 + addi.l r17, 8, r17 + beq/u r39, r40, tr1 +LOCAL(ia_r5_ld): /* Store r5 and load its address. 
*/ + movi 3, r38 + shlli r38, 20, r39 + and r0, r39, r40 + andc r0, r39, r0 + stx.q r17, r63, r5 + add.l r17, r63, r5 + addi.l r17, 8, r17 + beq/u r39, r40, tr1 +LOCAL(ia_r6_ld): /* Store r6 and load its address. */ + movi 3, r38 + shlli r38, 16, r39 + and r0, r39, r40 + andc r0, r39, r0 + stx.q r17, r63, r6 + add.l r17, r63, r6 + addi.l r17, 8, r17 + beq/u r39, r40, tr1 +LOCAL(ia_r7_ld): /* Store r7 and load its address. */ + movi 3 << 12, r39 + and r0, r39, r40 + andc r0, r39, r0 + stx.q r17, r63, r7 + add.l r17, r63, r7 + addi.l r17, 8, r17 + beq/u r39, r40, tr1 +LOCAL(ia_r8_ld): /* Store r8 and load its address. */ + movi 3 << 8, r39 + and r0, r39, r40 + andc r0, r39, r0 + stx.q r17, r63, r8 + add.l r17, r63, r8 + addi.l r17, 8, r17 + beq/u r39, r40, tr1 +LOCAL(ia_r9_ld): /* Store r9 and load its address. */ + stx.q r17, r63, r9 + add.l r17, r63, r9 + blink tr0, r63 +LOCAL(ia_r2_push): /* Push r2 onto the stack. */ + movi 1, r38 + shlli r38, 29, r39 + andc r0, r39, r0 + stx.q r17, r63, r2 + addi.l r17, 8, r17 + blink tr1, r63 +LOCAL(ia_r3_push): /* Push r3 onto the stack. */ + movi 1, r38 + shlli r38, 26, r39 + andc r0, r39, r0 + stx.q r17, r63, r3 + addi.l r17, 8, r17 + blink tr1, r63 +LOCAL(ia_r4_push): /* Push r4 onto the stack. */ + movi 1, r38 + shlli r38, 23, r39 + andc r0, r39, r0 + stx.q r17, r63, r4 + addi.l r17, 8, r17 + blink tr1, r63 +LOCAL(ia_r5_push): /* Push r5 onto the stack. */ + movi 1, r38 + shlli r38, 20, r39 + andc r0, r39, r0 + stx.q r17, r63, r5 + addi.l r17, 8, r17 + blink tr1, r63 +LOCAL(ia_r6_push): /* Push r6 onto the stack. */ + movi 1, r38 + shlli r38, 16, r39 + andc r0, r39, r0 + stx.q r17, r63, r6 + addi.l r17, 8, r17 + blink tr1, r63 +LOCAL(ia_r7_push): /* Push r7 onto the stack. */ + movi 1 << 12, r39 + andc r0, r39, r0 + stx.q r17, r63, r7 + addi.l r17, 8, r17 + blink tr1, r63 +LOCAL(ia_r8_push): /* Push r8 onto the stack. 
*/ + movi 1 << 8, r39 + andc r0, r39, r0 + stx.q r17, r63, r8 + addi.l r17, 8, r17 + blink tr1, r63 +LOCAL(ia_push_seq): /* Push a sequence of registers onto the stack. */ + andi r0, 7 << 1, r38 + movi (LOCAL(ia_end_of_push_seq) >> 16) & 65535, r40 + shlli r38, 2, r39 + shori LOCAL(ia_end_of_push_seq) & 65535, r40 + sub.l r40, r39, r41 + ptabs/l r41, tr2 + blink tr2, r63 +LOCAL(ia_stack_of_push_seq): /* Beginning of push sequence. */ + stx.q r17, r63, r3 + addi.l r17, 8, r17 + stx.q r17, r63, r4 + addi.l r17, 8, r17 + stx.q r17, r63, r5 + addi.l r17, 8, r17 + stx.q r17, r63, r6 + addi.l r17, 8, r17 + stx.q r17, r63, r7 + addi.l r17, 8, r17 + stx.q r17, r63, r8 + addi.l r17, 8, r17 +LOCAL(ia_r9_push): /* Push r9 onto the stack. */ + stx.q r17, r63, r9 +LOCAL(ia_return): /* Return. */ + blink tr0, r63 +LOCAL(ia_end_of_push_seq): /* Label used to compute the first push instruction. */ + ENDFUNC(GLOBAL(GCC_shcompact_incoming_args)) +#endif /* L_shcompact_incoming_args */ +#endif +#if __SH5__ +#ifdef L_nested_trampoline +#if __SH5__ == 32 + .section .text..SHmedia32,"ax" +#else + .text +#endif + .align 3 /* It is copied in units of 8 bytes in SHmedia mode. 
*/ + .global GLOBAL(GCC_nested_trampoline) + HIDDEN_FUNC(GLOBAL(GCC_nested_trampoline)) +GLOBAL(GCC_nested_trampoline): + .mode SHmedia + ptrel/u r63, tr0 + gettr tr0, r0 +#if __SH5__ == 64 + ld.q r0, 24, r1 +#else + ld.l r0, 24, r1 +#endif + ptabs/l r1, tr1 +#if __SH5__ == 64 + ld.q r0, 32, r1 +#else + ld.l r0, 28, r1 +#endif + blink tr1, r63 + + ENDFUNC(GLOBAL(GCC_nested_trampoline)) +#endif /* L_nested_trampoline */ +#endif /* __SH5__ */ +#if __SH5__ == 32 +#ifdef L_push_pop_shmedia_regs + .section .text..SHmedia32,"ax" + .mode SHmedia + .align 2 +#ifndef __SH4_NOFPU__ + .global GLOBAL(GCC_push_shmedia_regs) + FUNC(GLOBAL(GCC_push_shmedia_regs)) +GLOBAL(GCC_push_shmedia_regs): + addi.l r15, -14*8, r15 + fst.d r15, 13*8, dr62 + fst.d r15, 12*8, dr60 + fst.d r15, 11*8, dr58 + fst.d r15, 10*8, dr56 + fst.d r15, 9*8, dr54 + fst.d r15, 8*8, dr52 + fst.d r15, 7*8, dr50 + fst.d r15, 6*8, dr48 + fst.d r15, 5*8, dr46 + fst.d r15, 4*8, dr44 + fst.d r15, 3*8, dr42 + fst.d r15, 2*8, dr40 + fst.d r15, 1*8, dr38 + fst.d r15, 0*8, dr36 +#else /* ! __SH4_NOFPU__ */ + .global GLOBAL(GCC_push_shmedia_regs_nofpu) + FUNC(GLOBAL(GCC_push_shmedia_regs_nofpu)) +GLOBAL(GCC_push_shmedia_regs_nofpu): +#endif /* ! 
__SH4_NOFPU__ */ + ptabs/l r18, tr0 + addi.l r15, -27*8, r15 + gettr tr7, r62 + gettr tr6, r61 + gettr tr5, r60 + st.q r15, 26*8, r62 + st.q r15, 25*8, r61 + st.q r15, 24*8, r60 + st.q r15, 23*8, r59 + st.q r15, 22*8, r58 + st.q r15, 21*8, r57 + st.q r15, 20*8, r56 + st.q r15, 19*8, r55 + st.q r15, 18*8, r54 + st.q r15, 17*8, r53 + st.q r15, 16*8, r52 + st.q r15, 15*8, r51 + st.q r15, 14*8, r50 + st.q r15, 13*8, r49 + st.q r15, 12*8, r48 + st.q r15, 11*8, r47 + st.q r15, 10*8, r46 + st.q r15, 9*8, r45 + st.q r15, 8*8, r44 + st.q r15, 7*8, r35 + st.q r15, 6*8, r34 + st.q r15, 5*8, r33 + st.q r15, 4*8, r32 + st.q r15, 3*8, r31 + st.q r15, 2*8, r30 + st.q r15, 1*8, r29 + st.q r15, 0*8, r28 + blink tr0, r63 +#ifndef __SH4_NOFPU__ + ENDFUNC(GLOBAL(GCC_push_shmedia_regs)) +#else + ENDFUNC(GLOBAL(GCC_push_shmedia_regs_nofpu)) +#endif +#ifndef __SH4_NOFPU__ + .global GLOBAL(GCC_pop_shmedia_regs) + FUNC(GLOBAL(GCC_pop_shmedia_regs)) +GLOBAL(GCC_pop_shmedia_regs): + pt .L0, tr1 + movi 41*8, r0 + fld.d r15, 40*8, dr62 + fld.d r15, 39*8, dr60 + fld.d r15, 38*8, dr58 + fld.d r15, 37*8, dr56 + fld.d r15, 36*8, dr54 + fld.d r15, 35*8, dr52 + fld.d r15, 34*8, dr50 + fld.d r15, 33*8, dr48 + fld.d r15, 32*8, dr46 + fld.d r15, 31*8, dr44 + fld.d r15, 30*8, dr42 + fld.d r15, 29*8, dr40 + fld.d r15, 28*8, dr38 + fld.d r15, 27*8, dr36 + blink tr1, r63 +#else /* ! __SH4_NOFPU__ */ + .global GLOBAL(GCC_pop_shmedia_regs_nofpu) + FUNC(GLOBAL(GCC_pop_shmedia_regs_nofpu)) +GLOBAL(GCC_pop_shmedia_regs_nofpu): +#endif /* ! 
__SH4_NOFPU__ */ + movi 27*8, r0 +.L0: + ptabs r18, tr0 + ld.q r15, 26*8, r62 + ld.q r15, 25*8, r61 + ld.q r15, 24*8, r60 + ptabs r62, tr7 + ptabs r61, tr6 + ptabs r60, tr5 + ld.q r15, 23*8, r59 + ld.q r15, 22*8, r58 + ld.q r15, 21*8, r57 + ld.q r15, 20*8, r56 + ld.q r15, 19*8, r55 + ld.q r15, 18*8, r54 + ld.q r15, 17*8, r53 + ld.q r15, 16*8, r52 + ld.q r15, 15*8, r51 + ld.q r15, 14*8, r50 + ld.q r15, 13*8, r49 + ld.q r15, 12*8, r48 + ld.q r15, 11*8, r47 + ld.q r15, 10*8, r46 + ld.q r15, 9*8, r45 + ld.q r15, 8*8, r44 + ld.q r15, 7*8, r35 + ld.q r15, 6*8, r34 + ld.q r15, 5*8, r33 + ld.q r15, 4*8, r32 + ld.q r15, 3*8, r31 + ld.q r15, 2*8, r30 + ld.q r15, 1*8, r29 + ld.q r15, 0*8, r28 + add.l r15, r0, r15 + blink tr0, r63 + +#ifndef __SH4_NOFPU__ + ENDFUNC(GLOBAL(GCC_pop_shmedia_regs)) +#else + ENDFUNC(GLOBAL(GCC_pop_shmedia_regs_nofpu)) +#endif +#endif /* L_push_pop_shmedia_regs */ +#endif /* __SH5__ == 32 */ + +#ifdef L_div_table +#if __SH5__ +#if defined(__pic__) && defined(__SHMEDIA__) + .global GLOBAL(sdivsi3) + FUNC(GLOBAL(sdivsi3)) +#if __SH5__ == 32 + .section .text..SHmedia32,"ax" +#else + .text +#endif +#if 0 +/* ??? FIXME: Presumably due to a linker bug, exporting data symbols + in a text section does not work (at least for shared libraries): + the linker sets the LSB of the address as if this was SHmedia code. 
*/ +#define TEXT_DATA_BUG +#endif + .align 2 + // inputs: r4,r5 + // clobbered: r1,r18,r19,r20,r21,r25,tr0 + // result in r0 + .global GLOBAL(sdivsi3) +GLOBAL(sdivsi3): +#ifdef TEXT_DATA_BUG + ptb datalabel Local_div_table,tr0 +#else + ptb GLOBAL(div_table_internal),tr0 +#endif + nsb r5, r1 + shlld r5, r1, r25 // normalize; [-2 ..1, 1..2) in s2.62 + shari r25, 58, r21 // extract 5(6) bit index (s2.4 with hole -1..1) + /* bubble */ + gettr tr0,r20 + ldx.ub r20, r21, r19 // u0.8 + shari r25, 32, r25 // normalize to s2.30 + shlli r21, 1, r21 + muls.l r25, r19, r19 // s2.38 + ldx.w r20, r21, r21 // s2.14 + ptabs r18, tr0 + shari r19, 24, r19 // truncate to s2.14 + sub r21, r19, r19 // some 11 bit inverse in s1.14 + muls.l r19, r19, r21 // u0.28 + sub r63, r1, r1 + addi r1, 92, r1 + muls.l r25, r21, r18 // s2.58 + shlli r19, 45, r19 // multiply by two and convert to s2.58 + /* bubble */ + sub r19, r18, r18 + shari r18, 28, r18 // some 22 bit inverse in s1.30 + muls.l r18, r25, r0 // s2.60 + muls.l r18, r4, r25 // s32.30 + /* bubble */ + shari r0, 16, r19 // s-16.44 + muls.l r19, r18, r19 // s-16.74 + shari r25, 63, r0 + shari r4, 14, r18 // s19.-14 + shari r19, 30, r19 // s-16.44 + muls.l r19, r18, r19 // s15.30 + xor r21, r0, r21 // You could also use the constant 1 << 27. + add r21, r25, r21 + sub r21, r19, r21 + shard r21, r1, r21 + sub r21, r0, r0 + blink tr0, r63 + ENDFUNC(GLOBAL(sdivsi3)) +/* This table has been generated by divtab.c . +Defects for bias -330: + Max defect: 6.081536e-07 at -1.000000e+00 + Min defect: 2.849516e-08 at 1.030651e+00 + Max 2nd step defect: 9.606539e-12 at -1.000000e+00 + Min 2nd step defect: 0.000000e+00 at 0.000000e+00 + Defect at 1: 1.238659e-07 + Defect at -2: 1.061708e-07 */ +#else /* ! __pic__ || ! 
__SHMEDIA__ */ + .section .rodata +#endif /* __pic__ */ +#if defined(TEXT_DATA_BUG) && defined(__pic__) && defined(__SHMEDIA__) + .balign 2 + .type Local_div_table,@object + .size Local_div_table,128 +/* negative division constants */ + .word -16638 + .word -17135 + .word -17737 + .word -18433 + .word -19103 + .word -19751 + .word -20583 + .word -21383 + .word -22343 + .word -23353 + .word -24407 + .word -25582 + .word -26863 + .word -28382 + .word -29965 + .word -31800 +/* negative division factors */ + .byte 66 + .byte 70 + .byte 75 + .byte 81 + .byte 87 + .byte 93 + .byte 101 + .byte 109 + .byte 119 + .byte 130 + .byte 142 + .byte 156 + .byte 172 + .byte 192 + .byte 214 + .byte 241 + .skip 16 +Local_div_table: + .skip 16 +/* positive division factors */ + .byte 241 + .byte 214 + .byte 192 + .byte 172 + .byte 156 + .byte 142 + .byte 130 + .byte 119 + .byte 109 + .byte 101 + .byte 93 + .byte 87 + .byte 81 + .byte 75 + .byte 70 + .byte 66 +/* positive division constants */ + .word 31801 + .word 29966 + .word 28383 + .word 26864 + .word 25583 + .word 24408 + .word 23354 + .word 22344 + .word 21384 + .word 20584 + .word 19752 + .word 19104 + .word 18434 + .word 17738 + .word 17136 + .word 16639 + .section .rodata +#endif /* TEXT_DATA_BUG */ + .balign 2 + .type GLOBAL(div_table),@object + .size GLOBAL(div_table),128 +/* negative division constants */ + .word -16638 + .word -17135 + .word -17737 + .word -18433 + .word -19103 + .word -19751 + .word -20583 + .word -21383 + .word -22343 + .word -23353 + .word -24407 + .word -25582 + .word -26863 + .word -28382 + .word -29965 + .word -31800 +/* negative division factors */ + .byte 66 + .byte 70 + .byte 75 + .byte 81 + .byte 87 + .byte 93 + .byte 101 + .byte 109 + .byte 119 + .byte 130 + .byte 142 + .byte 156 + .byte 172 + .byte 192 + .byte 214 + .byte 241 + .skip 16 + .global GLOBAL(div_table) +GLOBAL(div_table): + HIDDEN_ALIAS(div_table_internal,div_table) + .skip 16 +/* positive division factors */ + .byte 241 + .byte 
214 + .byte 192 + .byte 172 + .byte 156 + .byte 142 + .byte 130 + .byte 119 + .byte 109 + .byte 101 + .byte 93 + .byte 87 + .byte 81 + .byte 75 + .byte 70 + .byte 66 +/* positive division constants */ + .word 31801 + .word 29966 + .word 28383 + .word 26864 + .word 25583 + .word 24408 + .word 23354 + .word 22344 + .word 21384 + .word 20584 + .word 19752 + .word 19104 + .word 18434 + .word 17738 + .word 17136 + .word 16639 + +#elif defined (__SH3__) || defined (__SH3E__) || defined (__SH4__) || defined (__SH4_SINGLE__) || defined (__SH4_SINGLE_ONLY__) || defined (__SH4_NOFPU__) +/* This code used shld, thus is not suitable for SH1 / SH2. */ + +/* Signed / unsigned division without use of FPU, optimized for SH4. + Uses a lookup table for divisors in the range -128 .. +128, and + div1 with case distinction for larger divisors in three more ranges. + The code is lumped together with the table to allow the use of mova. */ +#ifdef __LITTLE_ENDIAN__ +#define L_LSB 0 +#define L_LSWMSB 1 +#define L_MSWLSB 2 +#else +#define L_LSB 3 +#define L_LSWMSB 2 +#define L_MSWLSB 1 +#endif + + .balign 4 + .global GLOBAL(udivsi3_i4i) + FUNC(GLOBAL(udivsi3_i4i)) +GLOBAL(udivsi3_i4i): + mov.w LOCAL(c128_w), r1 + div0u + mov r4,r0 + shlr8 r0 + cmp/hi r1,r5 + extu.w r5,r1 + bf LOCAL(udiv_le128) + cmp/eq r5,r1 + bf LOCAL(udiv_ge64k) + shlr r0 + mov r5,r1 + shll16 r5 + mov.l r4,@-r15 + div1 r5,r0 + mov.l r1,@-r15 + div1 r5,r0 + div1 r5,r0 + bra LOCAL(udiv_25) + div1 r5,r0 + +LOCAL(div_le128): + mova LOCAL(div_table_ix),r0 + bra LOCAL(div_le128_2) + mov.b @(r0,r5),r1 +LOCAL(udiv_le128): + mov.l r4,@-r15 + mova LOCAL(div_table_ix),r0 + mov.b @(r0,r5),r1 + mov.l r5,@-r15 +LOCAL(div_le128_2): + mova LOCAL(div_table_inv),r0 + mov.l @(r0,r1),r1 + mov r5,r0 + tst #0xfe,r0 + mova LOCAL(div_table_clz),r0 + dmulu.l r1,r4 + mov.b @(r0,r5),r1 + bt/s LOCAL(div_by_1) + mov r4,r0 + mov.l @r15+,r5 + sts mach,r0 + /* clrt */ + addc r4,r0 + mov.l @r15+,r4 + rotcr r0 + rts + shld r1,r0 + +LOCAL(div_by_1_neg): + 
neg r4,r0 +LOCAL(div_by_1): + mov.l @r15+,r5 + rts + mov.l @r15+,r4 + +LOCAL(div_ge64k): + bt/s LOCAL(div_r8) + div0u + shll8 r5 + bra LOCAL(div_ge64k_2) + div1 r5,r0 +LOCAL(udiv_ge64k): + cmp/hi r0,r5 + mov r5,r1 + bt LOCAL(udiv_r8) + shll8 r5 + mov.l r4,@-r15 + div1 r5,r0 + mov.l r1,@-r15 +LOCAL(div_ge64k_2): + div1 r5,r0 + mov.l LOCAL(zero_l),r1 + .rept 4 + div1 r5,r0 + .endr + mov.l r1,@-r15 + div1 r5,r0 + mov.w LOCAL(m256_w),r1 + div1 r5,r0 + mov.b r0,@(L_LSWMSB,r15) + xor r4,r0 + and r1,r0 + bra LOCAL(div_ge64k_end) + xor r4,r0 + +LOCAL(div_r8): + shll16 r4 + bra LOCAL(div_r8_2) + shll8 r4 +LOCAL(udiv_r8): + mov.l r4,@-r15 + shll16 r4 + clrt + shll8 r4 + mov.l r5,@-r15 +LOCAL(div_r8_2): + rotcl r4 + mov r0,r1 + div1 r5,r1 + mov r4,r0 + rotcl r0 + mov r5,r4 + div1 r5,r1 + .rept 5 + rotcl r0; div1 r5,r1 + .endr + rotcl r0 + mov.l @r15+,r5 + div1 r4,r1 + mov.l @r15+,r4 + rts + rotcl r0 + + ENDFUNC(GLOBAL(udivsi3_i4i)) + + .global GLOBAL(sdivsi3_i4i) + FUNC(GLOBAL(sdivsi3_i4i)) + /* This is link-compatible with a GLOBAL(sdivsi3) call, + but we effectively clobber only r1. */ +GLOBAL(sdivsi3_i4i): + mov.l r4,@-r15 + cmp/pz r5 + mov.w LOCAL(c128_w), r1 + bt/s LOCAL(pos_divisor) + cmp/pz r4 + mov.l r5,@-r15 + neg r5,r5 + bt/s LOCAL(neg_result) + cmp/hi r1,r5 + neg r4,r4 +LOCAL(pos_result): + extu.w r5,r0 + bf LOCAL(div_le128) + cmp/eq r5,r0 + mov r4,r0 + shlr8 r0 + bf/s LOCAL(div_ge64k) + cmp/hi r0,r5 + div0u + shll16 r5 + div1 r5,r0 + div1 r5,r0 + div1 r5,r0 +LOCAL(udiv_25): + mov.l LOCAL(zero_l),r1 + div1 r5,r0 + div1 r5,r0 + mov.l r1,@-r15 + .rept 3 + div1 r5,r0 + .endr + mov.b r0,@(L_MSWLSB,r15) + xtrct r4,r0 + swap.w r0,r0 + .rept 8 + div1 r5,r0 + .endr + mov.b r0,@(L_LSWMSB,r15) +LOCAL(div_ge64k_end): + .rept 8 + div1 r5,r0 + .endr + mov.l @r15+,r4 ! zero-extension and swap using LS unit. 
+ extu.b r0,r0 + mov.l @r15+,r5 + or r4,r0 + mov.l @r15+,r4 + rts + rotcl r0 + +LOCAL(div_le128_neg): + tst #0xfe,r0 + mova LOCAL(div_table_ix),r0 + mov.b @(r0,r5),r1 + mova LOCAL(div_table_inv),r0 + bt/s LOCAL(div_by_1_neg) + mov.l @(r0,r1),r1 + mova LOCAL(div_table_clz),r0 + dmulu.l r1,r4 + mov.b @(r0,r5),r1 + mov.l @r15+,r5 + sts mach,r0 + /* clrt */ + addc r4,r0 + mov.l @r15+,r4 + rotcr r0 + shld r1,r0 + rts + neg r0,r0 + +LOCAL(pos_divisor): + mov.l r5,@-r15 + bt/s LOCAL(pos_result) + cmp/hi r1,r5 + neg r4,r4 +LOCAL(neg_result): + extu.w r5,r0 + bf LOCAL(div_le128_neg) + cmp/eq r5,r0 + mov r4,r0 + shlr8 r0 + bf/s LOCAL(div_ge64k_neg) + cmp/hi r0,r5 + div0u + mov.l LOCAL(zero_l),r1 + shll16 r5 + div1 r5,r0 + mov.l r1,@-r15 + .rept 7 + div1 r5,r0 + .endr + mov.b r0,@(L_MSWLSB,r15) + xtrct r4,r0 + swap.w r0,r0 + .rept 8 + div1 r5,r0 + .endr + mov.b r0,@(L_LSWMSB,r15) +LOCAL(div_ge64k_neg_end): + .rept 8 + div1 r5,r0 + .endr + mov.l @r15+,r4 ! zero-extension and swap using LS unit. + extu.b r0,r1 + mov.l @r15+,r5 + or r4,r1 +LOCAL(div_r8_neg_end): + mov.l @r15+,r4 + rotcl r1 + rts + neg r1,r0 + +LOCAL(div_ge64k_neg): + bt/s LOCAL(div_r8_neg) + div0u + shll8 r5 + mov.l LOCAL(zero_l),r1 + .rept 6 + div1 r5,r0 + .endr + mov.l r1,@-r15 + div1 r5,r0 + mov.w LOCAL(m256_w),r1 + div1 r5,r0 + mov.b r0,@(L_LSWMSB,r15) + xor r4,r0 + and r1,r0 + bra LOCAL(div_ge64k_neg_end) + xor r4,r0 + +LOCAL(c128_w): + .word 128 + +LOCAL(div_r8_neg): + clrt + shll16 r4 + mov r4,r1 + shll8 r1 + mov r5,r4 + .rept 7 + rotcl r1; div1 r5,r0 + .endr + mov.l @r15+,r5 + rotcl r1 + bra LOCAL(div_r8_neg_end) + div1 r4,r0 + +LOCAL(m256_w): + .word 0xff00 +/* This table has been generated by divtab-sh4.c. 
*/ + .balign 4 +LOCAL(div_table_clz): + .byte 0 + .byte 1 + .byte 0 + .byte -1 + .byte -1 + .byte -2 + .byte -2 + .byte -2 + .byte -2 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -3 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -4 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -5 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 + .byte -6 +/* Lookup table translating positive divisor to index into table of + normalized inverse. N.B. the '0' entry is also the last entry of the + previous table, and causes an unaligned access for division by zero. 
*/ +LOCAL(div_table_ix): + .byte -6 + .byte -128 + .byte -128 + .byte 0 + .byte -128 + .byte -64 + .byte 0 + .byte 64 + .byte -128 + .byte -96 + .byte -64 + .byte -32 + .byte 0 + .byte 32 + .byte 64 + .byte 96 + .byte -128 + .byte -112 + .byte -96 + .byte -80 + .byte -64 + .byte -48 + .byte -32 + .byte -16 + .byte 0 + .byte 16 + .byte 32 + .byte 48 + .byte 64 + .byte 80 + .byte 96 + .byte 112 + .byte -128 + .byte -120 + .byte -112 + .byte -104 + .byte -96 + .byte -88 + .byte -80 + .byte -72 + .byte -64 + .byte -56 + .byte -48 + .byte -40 + .byte -32 + .byte -24 + .byte -16 + .byte -8 + .byte 0 + .byte 8 + .byte 16 + .byte 24 + .byte 32 + .byte 40 + .byte 48 + .byte 56 + .byte 64 + .byte 72 + .byte 80 + .byte 88 + .byte 96 + .byte 104 + .byte 112 + .byte 120 + .byte -128 + .byte -124 + .byte -120 + .byte -116 + .byte -112 + .byte -108 + .byte -104 + .byte -100 + .byte -96 + .byte -92 + .byte -88 + .byte -84 + .byte -80 + .byte -76 + .byte -72 + .byte -68 + .byte -64 + .byte -60 + .byte -56 + .byte -52 + .byte -48 + .byte -44 + .byte -40 + .byte -36 + .byte -32 + .byte -28 + .byte -24 + .byte -20 + .byte -16 + .byte -12 + .byte -8 + .byte -4 + .byte 0 + .byte 4 + .byte 8 + .byte 12 + .byte 16 + .byte 20 + .byte 24 + .byte 28 + .byte 32 + .byte 36 + .byte 40 + .byte 44 + .byte 48 + .byte 52 + .byte 56 + .byte 60 + .byte 64 + .byte 68 + .byte 72 + .byte 76 + .byte 80 + .byte 84 + .byte 88 + .byte 92 + .byte 96 + .byte 100 + .byte 104 + .byte 108 + .byte 112 + .byte 116 + .byte 120 + .byte 124 + .byte -128 +/* 1/64 .. 1/127, normalized. There is an implicit leading 1 in bit 32. 
*/ + .balign 4 +LOCAL(zero_l): + .long 0x0 + .long 0xF81F81F9 + .long 0xF07C1F08 + .long 0xE9131AC0 + .long 0xE1E1E1E2 + .long 0xDAE6076C + .long 0xD41D41D5 + .long 0xCD856891 + .long 0xC71C71C8 + .long 0xC0E07039 + .long 0xBACF914D + .long 0xB4E81B4F + .long 0xAF286BCB + .long 0xA98EF607 + .long 0xA41A41A5 + .long 0x9EC8E952 + .long 0x9999999A + .long 0x948B0FCE + .long 0x8F9C18FA + .long 0x8ACB90F7 + .long 0x86186187 + .long 0x81818182 + .long 0x7D05F418 + .long 0x78A4C818 + .long 0x745D1746 + .long 0x702E05C1 + .long 0x6C16C16D + .long 0x68168169 + .long 0x642C8591 + .long 0x60581606 + .long 0x5C9882BA + .long 0x58ED2309 +LOCAL(div_table_inv): + .long 0x55555556 + .long 0x51D07EAF + .long 0x4E5E0A73 + .long 0x4AFD6A06 + .long 0x47AE147B + .long 0x446F8657 + .long 0x41414142 + .long 0x3E22CBCF + .long 0x3B13B13C + .long 0x38138139 + .long 0x3521CFB3 + .long 0x323E34A3 + .long 0x2F684BDB + .long 0x2C9FB4D9 + .long 0x29E4129F + .long 0x27350B89 + .long 0x24924925 + .long 0x21FB7813 + .long 0x1F7047DD + .long 0x1CF06ADB + .long 0x1A7B9612 + .long 0x18118119 + .long 0x15B1E5F8 + .long 0x135C8114 + .long 0x11111112 + .long 0xECF56BF + .long 0xC9714FC + .long 0xA6810A7 + .long 0x8421085 + .long 0x624DD30 + .long 0x4104105 + .long 0x2040811 + /* maximum error: 0.987342 scaled: 0.921875*/ + + ENDFUNC(GLOBAL(sdivsi3_i4i)) +#endif /* SH3 / SH4 */ + +#endif /* L_div_table */ + +#ifdef L_udiv_qrnnd_16 +#if !__SHMEDIA__ + HIDDEN_FUNC(GLOBAL(udiv_qrnnd_16)) + /* r0: rn r1: qn */ /* r0: n1 r4: n0 r5: d r6: d1 */ /* r2: __m */ + /* n1 < d, but n1 might be larger than d1. 
*/ + .global GLOBAL(udiv_qrnnd_16) + .balign 8 +GLOBAL(udiv_qrnnd_16): + div0u + cmp/hi r6,r0 + bt .Lots + .rept 16 + div1 r6,r0 + .endr + extu.w r0,r1 + bt 0f + add r6,r0 +0: rotcl r1 + mulu.w r1,r5 + xtrct r4,r0 + swap.w r0,r0 + sts macl,r2 + cmp/hs r2,r0 + sub r2,r0 + bt 0f + addc r5,r0 + add #-1,r1 + bt 0f +1: add #-1,r1 + rts + add r5,r0 + .balign 8 +.Lots: + sub r5,r0 + swap.w r4,r1 + xtrct r0,r1 + clrt + mov r1,r0 + addc r5,r0 + mov #-1,r1 + SL1(bf, 1b, + shlr16 r1) +0: rts + nop + ENDFUNC(GLOBAL(udiv_qrnnd_16)) +#endif /* !__SHMEDIA__ */ +#endif /* L_udiv_qrnnd_16 */ diff --git a/libgcc/config/sh/lib1funcs.h b/libgcc/config/sh/lib1funcs.h new file mode 100644 index 00000000000..af4b41cc314 --- /dev/null +++ b/libgcc/config/sh/lib1funcs.h @@ -0,0 +1,76 @@ +/* Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003, + 2004, 2005, 2006, 2009 + Free Software Foundation, Inc. + +This file is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3, or (at your option) any +later version. + +This file is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. 
*/ + +#ifdef __ELF__ +#define LOCAL(X) .L_##X +#define FUNC(X) .type X,@function +#define HIDDEN_FUNC(X) FUNC(X); .hidden X +#define HIDDEN_ALIAS(X,Y) ALIAS (X,Y); .hidden GLOBAL(X) +#define ENDFUNC0(X) .Lfe_##X: .size X,.Lfe_##X-X +#define ENDFUNC(X) ENDFUNC0(X) +#else +#define LOCAL(X) L_##X +#define FUNC(X) +#define HIDDEN_FUNC(X) +#define HIDDEN_ALIAS(X,Y) ALIAS (X,Y) +#define ENDFUNC(X) +#endif + +#define CONCAT(A,B) A##B +#define GLOBAL0(U,X) CONCAT(U,__##X) +#define GLOBAL(X) GLOBAL0(__USER_LABEL_PREFIX__,X) + +#define ALIAS(X,Y) .global GLOBAL(X); .set GLOBAL(X),GLOBAL(Y) + +#if defined __SH2A__ && defined __FMOVD_ENABLED__ +#undef FMOVD_WORKS +#define FMOVD_WORKS +#endif + +#ifdef __LITTLE_ENDIAN__ +#define DR00 fr1 +#define DR01 fr0 +#define DR20 fr3 +#define DR21 fr2 +#define DR40 fr5 +#define DR41 fr4 +#else /* !__LITTLE_ENDIAN__ */ +#define DR00 fr0 +#define DR01 fr1 +#define DR20 fr2 +#define DR21 fr3 +#define DR40 fr4 +#define DR41 fr5 +#endif /* !__LITTLE_ENDIAN__ */ + +#ifdef __sh1__ +#define SL(branch, dest, in_slot, in_slot_arg2) \ + in_slot, in_slot_arg2; branch dest +#define SL1(branch, dest, in_slot) \ + in_slot; branch dest +#else /* ! 
__sh1__ */ +#define SL(branch, dest, in_slot, in_slot_arg2) \ + branch##.s dest; in_slot, in_slot_arg2 +#define SL1(branch, dest, in_slot) \ + branch##/s dest; in_slot +#endif /* !__sh1__ */ diff --git a/libgcc/config/sh/t-linux b/libgcc/config/sh/t-linux index af618e260c6..9b1feacd1f3 100644 --- a/libgcc/config/sh/t-linux +++ b/libgcc/config/sh/t-linux @@ -1,3 +1,5 @@ +LIB1ASMFUNCS_CACHE = _ic_invalidate _ic_invalidate_array + HOST_LIBGCC2_CFLAGS = -fpic -mieee -DNO_FPSCR_VALUES # Override t-slibgcc-elf-ver to export some libgcc symbols with diff --git a/libgcc/config/sh/t-netbsd b/libgcc/config/sh/t-netbsd new file mode 100644 index 00000000000..663edbf4187 --- /dev/null +++ b/libgcc/config/sh/t-netbsd @@ -0,0 +1 @@ +LIB1ASMFUNCS_CACHE = _ic_invalidate diff --git a/libgcc/config/sh/t-sh b/libgcc/config/sh/t-sh index ab4d98089b1..2319adbef1d 100644 --- a/libgcc/config/sh/t-sh +++ b/libgcc/config/sh/t-sh @@ -17,26 +17,33 @@ # along with GCC; see the file COPYING3. If not see # <http://www.gnu.org/licenses/>. 
+LIB1ASMSRC = sh/lib1funcs.S +LIB1ASMFUNCS = _ashiftrt _ashiftrt_n _ashiftlt _lshiftrt _movmem \ + _movmem_i4 _mulsi3 _sdivsi3 _sdivsi3_i4 _udivsi3 _udivsi3_i4 _set_fpscr \ + _div_table _udiv_qrnnd_16 \ + $(LIB1ASMFUNCS_CACHE) +LIB1ASMFUNCS_CACHE = _ic_invalidate _ic_invalidate_array + crt1.o: $(srcdir)/config/sh/crt1.S $(gcc_compile) -c $< -ic_invalidate_array_4-100.o: $(gcc_srcdir)/config/sh/lib1funcs.asm +ic_invalidate_array_4-100.o: $(srcdir)/config/sh/lib1funcs.S $(gcc_compile) -c -DL_ic_invalidate_array -DWAYS=1 -DWAY_SIZE=0x2000 $< libic_invalidate_array_4-100.a: ic_invalidate_array_4-100.o $(AR_CREATE_FOR_TARGET) $@ $< -ic_invalidate_array_4-200.o: $(gcc_srcdir)/config/sh/lib1funcs.asm +ic_invalidate_array_4-200.o: $(srcdir)/config/sh/lib1funcs.S $(gcc_compile) -c -DL_ic_invalidate_array -DWAYS=2 -DWAY_SIZE=0x2000 $< libic_invalidate_array_4-200.a: ic_invalidate_array_4-200.o $(AR_CREATE_FOR_TARGET) $@ $< -ic_invalidate_array_4a.o: $(gcc_srcdir)/config/sh/lib1funcs.asm +ic_invalidate_array_4a.o: $(srcdir)/config/sh/lib1funcs.S $(gcc_compile) -c -DL_ic_invalidate_array -D__FORCE_SH4A__ $< libic_invalidate_array_4a.a: ic_invalidate_array_4a.o $(AR_CREATE_FOR_TARGET) $@ $< sdivsi3_i4i-Os-4-200.o: $(srcdir)/config/sh/lib1funcs-Os-4-200.S $(gcc_compile) -c -DL_sdivsi3_i4i $< udivsi3_i4i-Os-4-200.o: $(srcdir)/config/sh/lib1funcs-Os-4-200.S $(gcc_compile) -c -DL_udivsi3_i4i $< unwind-dw2-Os-4-200.o: $(gcc_srcdir)/unwind-dw2.c diff --git a/libgcc/config/sh/t-sh64 b/libgcc/config/sh/t-sh64 new file mode 100644 index 00000000000..fa9950e03b2 --- /dev/null +++ b/libgcc/config/sh/t-sh64 @@ -0,0 +1,6 @@ +LIB1ASMFUNCS = \ + _sdivsi3 _sdivsi3_i4 _udivsi3 _udivsi3_i4 _set_fpscr \ + _shcompact_call_trampoline _shcompact_return_trampoline \ + _shcompact_incoming_args _ic_invalidate _nested_trampoline \ + _push_pop_shmedia_regs \ + _udivdi3 _divdi3 _umoddi3 _moddi3 _div_table diff --git a/libgcc/config/sparc/lb1spc.S 
b/libgcc/config/sparc/lb1spc.S new file mode 100644 index 00000000000..b60bd5740e7 --- /dev/null +++ b/libgcc/config/sparc/lb1spc.S @@ -0,0 +1,784 @@ +/* This is an assembly language implementation of mulsi3, divsi3, and modsi3 + for the sparc processor. + + These routines are derived from the SPARC Architecture Manual, version 8, + slightly edited to match the desired calling convention, and also to + optimize them for our purposes. */ + +#ifdef L_mulsi3 +.text + .align 4 + .global .umul + .proc 4 +.umul: + or %o0, %o1, %o4 ! logical or of multiplier and multiplicand + mov %o0, %y ! multiplier to Y register + andncc %o4, 0xfff, %o5 ! mask out lower 12 bits + be mul_shortway ! can do it the short way + andcc %g0, %g0, %o4 ! zero the partial product and clear NV cc + ! + ! long multiply + ! + mulscc %o4, %o1, %o4 ! first iteration of 33 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 ! 32nd iteration + mulscc %o4, %g0, %o4 ! last iteration only shifts + ! the upper 32 bits of product are wrong, but we do not care + retl + rd %y, %o0 + ! + ! short multiply + ! +mul_shortway: + mulscc %o4, %o1, %o4 ! 
first iteration of 13 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 + mulscc %o4, %o1, %o4 ! 12th iteration + mulscc %o4, %g0, %o4 ! last iteration only shifts + rd %y, %o5 + sll %o4, 12, %o4 ! left shift partial product by 12 bits + srl %o5, 20, %o5 ! right shift partial product by 20 bits + retl + or %o5, %o4, %o0 ! merge for true product +#endif + +#ifdef L_divsi3 +/* + * Division and remainder, from Appendix E of the SPARC Version 8 + * Architecture Manual, with fixes from Gordon Irlam. + */ + +/* + * Input: dividend and divisor in %o0 and %o1 respectively. + * + * m4 parameters: + * .div name of function to generate + * div div=div => %o0 / %o1; div=rem => %o0 % %o1 + * true true=true => signed; true=false => unsigned + * + * Algorithm parameters: + * N how many bits per iteration we try to get (4) + * WORDSIZE total number of bits (32) + * + * Derived constants: + * TOPBITS number of bits in the top decade of a number + * + * Important variables: + * Q the partial quotient under development (initially 0) + * R the remainder so far, initially the dividend + * ITER number of main division loop iterations required; + * equal to ceil(log2(quotient) / N). Note that this + * is the log base (2^N) of the quotient. + * V the current comparand, initially divisor*2^(ITER*N-1) + * + * Cost: + * Current estimate for non-large dividend is + * ceil(log2(quotient) / N) * (10 + 7N/2) + C + * A large dividend is one greater than 2^(31-TOPBITS) and takes a + * different path, as the upper bits of the quotient must be developed + * one bit at a time. + */ + .global .udiv + .align 4 + .proc 4 + .text +.udiv: + b ready_to_divide + mov 0, %g3 ! result is always positive + + .global .div + .align 4 + .proc 4 + .text +.div: + ! 
compute sign of result; if neither is negative, no problem + orcc %o1, %o0, %g0 ! either negative? + bge ready_to_divide ! no, go do the divide + xor %o1, %o0, %g3 ! compute sign in any case + tst %o1 + bge 1f + tst %o0 + ! %o1 is definitely negative; %o0 might also be negative + bge ready_to_divide ! if %o0 not negative... + sub %g0, %o1, %o1 ! in any case, make %o1 nonneg +1: ! %o0 is negative, %o1 is nonnegative + sub %g0, %o0, %o0 ! make %o0 nonnegative + + +ready_to_divide: + + ! Ready to divide. Compute size of quotient; scale comparand. + orcc %o1, %g0, %o5 + bne 1f + mov %o0, %o3 + + ! Divide by zero trap. If it returns, return 0 (about as + ! wrong as possible, but that is what SunOS does...). + ta 0x2 ! ST_DIV0 + retl + clr %o0 + +1: + cmp %o3, %o5 ! if %o1 exceeds %o0, done + blu got_result ! (and algorithm fails otherwise) + clr %o2 + sethi %hi(1 << (32 - 4 - 1)), %g1 + cmp %o3, %g1 + blu not_really_big + clr %o4 + + ! Here the dividend is >= 2**(31-N) or so. We must be careful here, + ! as our usual N-at-a-shot divide step will cause overflow and havoc. + ! The number of bits in the result here is N*ITER+SC, where SC <= N. + ! Compute ITER in an unorthodox manner: know we need to shift V into + ! the top decade: so do not even bother to compare to R. + 1: + cmp %o5, %g1 + bgeu 3f + mov 1, %g2 + sll %o5, 4, %o5 + b 1b + add %o4, 1, %o4 + + ! Now compute %g2. + 2: addcc %o5, %o5, %o5 + bcc not_too_big + add %g2, 1, %g2 + + ! We get here if the %o1 overflowed while shifting. + ! This means that %o3 has the high-order bit set. + ! Restore %o5 and subtract from %o3. + sll %g1, 4, %g1 ! high order bit + srl %o5, 1, %o5 ! rest of %o5 + add %o5, %g1, %o5 + b do_single_div + sub %g2, 1, %g2 + + not_too_big: + 3: cmp %o5, %o3 + blu 2b + nop + be do_single_div + nop + /* NB: these are commented out in the V8-SPARC manual as well */ + /* (I do not understand this) */ + ! %o5 > %o3: went too far: back up 1 step + ! srl %o5, 1, %o5 + ! dec %g2 + ! 
do single-bit divide steps + ! + ! We have to be careful here. We know that %o3 >= %o5, so we can do the + ! first divide step without thinking. BUT, the others are conditional, + ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high- + ! order bit set in the first step, just falling into the regular + ! division loop will mess up the first time around. + ! So we unroll slightly... + do_single_div: + subcc %g2, 1, %g2 + bl end_regular_divide + nop + sub %o3, %o5, %o3 + mov 1, %o2 + b end_single_divloop + nop + single_divloop: + sll %o2, 1, %o2 + bl 1f + srl %o5, 1, %o5 + ! %o3 >= 0 + sub %o3, %o5, %o3 + b 2f + add %o2, 1, %o2 + 1: ! %o3 < 0 + add %o3, %o5, %o3 + sub %o2, 1, %o2 + 2: + end_single_divloop: + subcc %g2, 1, %g2 + bge single_divloop + tst %o3 + b,a end_regular_divide + +not_really_big: +1: + sll %o5, 4, %o5 + cmp %o5, %o3 + bleu 1b + addcc %o4, 1, %o4 + be got_result + sub %o4, 1, %o4 + + tst %o3 ! set up for initial iteration +divloop: + sll %o2, 4, %o2 + ! depth 1, accumulated bits 0 + bl L1.16 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 2, accumulated bits 1 + bl L2.17 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 3, accumulated bits 3 + bl L3.19 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits 7 + bl L4.23 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (7*2+1), %o2 + +L4.23: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (7*2-1), %o2 + + +L3.19: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits 5 + bl L4.21 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (5*2+1), %o2 + +L4.21: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (5*2-1), %o2 + +L2.17: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 3, accumulated bits 1 + bl L3.17 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! 
depth 4, accumulated bits 3 + bl L4.19 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (3*2+1), %o2 + +L4.19: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (3*2-1), %o2 + +L3.17: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits 1 + bl L4.17 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (1*2+1), %o2 + +L4.17: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (1*2-1), %o2 + +L1.16: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 2, accumulated bits -1 + bl L2.15 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 3, accumulated bits -1 + bl L3.15 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits -1 + bl L4.15 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-1*2+1), %o2 + +L4.15: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-1*2-1), %o2 + +L3.15: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits -3 + bl L4.13 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-3*2+1), %o2 + +L4.13: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-3*2-1), %o2 + +L2.15: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 3, accumulated bits -3 + bl L3.13 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits -5 + bl L4.11 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-5*2+1), %o2 + +L4.11: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-5*2-1), %o2 + +L3.13: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits -7 + bl L4.9 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-7*2+1), %o2 + +L4.9: + ! 
remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-7*2-1), %o2 + + 9: +end_regular_divide: + subcc %o4, 1, %o4 + bge divloop + tst %o3 + bl,a got_result + ! non-restoring fixup here (one instruction only!) + sub %o2, 1, %o2 + + +got_result: + ! check to see if answer should be < 0 + tst %g3 + bl,a 1f + sub %g0, %o2, %o2 +1: + retl + mov %o2, %o0 +#endif + +#ifdef L_modsi3 +/* This implementation was taken from glibc: + * + * Input: dividend and divisor in %o0 and %o1 respectively. + * + * Algorithm parameters: + * N how many bits per iteration we try to get (4) + * WORDSIZE total number of bits (32) + * + * Derived constants: + * TOPBITS number of bits in the top decade of a number + * + * Important variables: + * Q the partial quotient under development (initially 0) + * R the remainder so far, initially the dividend + * ITER number of main division loop iterations required; + * equal to ceil(log2(quotient) / N). Note that this + * is the log base (2^N) of the quotient. + * V the current comparand, initially divisor*2^(ITER*N-1) + * + * Cost: + * Current estimate for non-large dividend is + * ceil(log2(quotient) / N) * (10 + 7N/2) + C + * A large dividend is one greater than 2^(31-TOPBITS) and takes a + * different path, as the upper bits of the quotient must be developed + * one bit at a time. + */ +.text + .align 4 + .global .urem + .proc 4 +.urem: + b divide + mov 0, %g3 ! result always positive + + .align 4 + .global .rem + .proc 4 +.rem: + ! compute sign of result; if neither is negative, no problem + orcc %o1, %o0, %g0 ! either negative? + bge 2f ! no, go do the divide + mov %o0, %g3 ! sign of remainder matches %o0 + tst %o1 + bge 1f + tst %o0 + ! %o1 is definitely negative; %o0 might also be negative + bge 2f ! if %o0 not negative... + sub %g0, %o1, %o1 ! in any case, make %o1 nonneg +1: ! %o0 is negative, %o1 is nonnegative + sub %g0, %o0, %o0 ! make %o0 nonnegative +2: + + ! Ready to divide. Compute size of quotient; scale comparand. 
+divide: + orcc %o1, %g0, %o5 + bne 1f + mov %o0, %o3 + + ! Divide by zero trap. If it returns, return 0 (about as + ! wrong as possible, but that is what SunOS does...). + ta 0x2 !ST_DIV0 + retl + clr %o0 + +1: + cmp %o3, %o5 ! if %o1 exceeds %o0, done + blu got_result ! (and algorithm fails otherwise) + clr %o2 + sethi %hi(1 << (32 - 4 - 1)), %g1 + cmp %o3, %g1 + blu not_really_big + clr %o4 + + ! Here the dividend is >= 2**(31-N) or so. We must be careful here, + ! as our usual N-at-a-shot divide step will cause overflow and havoc. + ! The number of bits in the result here is N*ITER+SC, where SC <= N. + ! Compute ITER in an unorthodox manner: know we need to shift V into + ! the top decade: so do not even bother to compare to R. + 1: + cmp %o5, %g1 + bgeu 3f + mov 1, %g2 + sll %o5, 4, %o5 + b 1b + add %o4, 1, %o4 + + ! Now compute %g2. + 2: addcc %o5, %o5, %o5 + bcc not_too_big + add %g2, 1, %g2 + + ! We get here if the %o1 overflowed while shifting. + ! This means that %o3 has the high-order bit set. + ! Restore %o5 and subtract from %o3. + sll %g1, 4, %g1 ! high order bit + srl %o5, 1, %o5 ! rest of %o5 + add %o5, %g1, %o5 + b do_single_div + sub %g2, 1, %g2 + + not_too_big: + 3: cmp %o5, %o3 + blu 2b + nop + be do_single_div + nop + /* NB: these are commented out in the V8-SPARC manual as well */ + /* (I do not understand this) */ + ! %o5 > %o3: went too far: back up 1 step + ! srl %o5, 1, %o5 + ! dec %g2 + ! do single-bit divide steps + ! + ! We have to be careful here. We know that %o3 >= %o5, so we can do the + ! first divide step without thinking. BUT, the others are conditional, + ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high- + ! order bit set in the first step, just falling into the regular + ! division loop will mess up the first time around. + ! So we unroll slightly... 
+ do_single_div: + subcc %g2, 1, %g2 + bl end_regular_divide + nop + sub %o3, %o5, %o3 + mov 1, %o2 + b end_single_divloop + nop + single_divloop: + sll %o2, 1, %o2 + bl 1f + srl %o5, 1, %o5 + ! %o3 >= 0 + sub %o3, %o5, %o3 + b 2f + add %o2, 1, %o2 + 1: ! %o3 < 0 + add %o3, %o5, %o3 + sub %o2, 1, %o2 + 2: + end_single_divloop: + subcc %g2, 1, %g2 + bge single_divloop + tst %o3 + b,a end_regular_divide + +not_really_big: +1: + sll %o5, 4, %o5 + cmp %o5, %o3 + bleu 1b + addcc %o4, 1, %o4 + be got_result + sub %o4, 1, %o4 + + tst %o3 ! set up for initial iteration +divloop: + sll %o2, 4, %o2 + ! depth 1, accumulated bits 0 + bl L1.16 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 2, accumulated bits 1 + bl L2.17 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 3, accumulated bits 3 + bl L3.19 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits 7 + bl L4.23 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (7*2+1), %o2 +L4.23: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (7*2-1), %o2 + +L3.19: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits 5 + bl L4.21 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (5*2+1), %o2 + +L4.21: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (5*2-1), %o2 + +L2.17: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 3, accumulated bits 1 + bl L3.17 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits 3 + bl L4.19 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (3*2+1), %o2 + +L4.19: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (3*2-1), %o2 + +L3.17: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits 1 + bl L4.17 + srl %o5,1,%o5 + ! 
remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (1*2+1), %o2 + +L4.17: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (1*2-1), %o2 + +L1.16: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 2, accumulated bits -1 + bl L2.15 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 3, accumulated bits -1 + bl L3.15 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits -1 + bl L4.15 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-1*2+1), %o2 + +L4.15: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-1*2-1), %o2 + +L3.15: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits -3 + bl L4.13 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-3*2+1), %o2 + +L4.13: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-3*2-1), %o2 + +L2.15: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 3, accumulated bits -3 + bl L3.13 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + ! depth 4, accumulated bits -5 + bl L4.11 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-5*2+1), %o2 + +L4.11: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-5*2-1), %o2 + +L3.13: + ! remainder is negative + addcc %o3,%o5,%o3 + ! depth 4, accumulated bits -7 + bl L4.9 + srl %o5,1,%o5 + ! remainder is positive + subcc %o3,%o5,%o3 + b 9f + add %o2, (-7*2+1), %o2 + +L4.9: + ! remainder is negative + addcc %o3,%o5,%o3 + b 9f + add %o2, (-7*2-1), %o2 + + 9: +end_regular_divide: + subcc %o4, 1, %o4 + bge divloop + tst %o3 + bl,a got_result + ! non-restoring fixup here (one instruction only!) + add %o3, %o1, %o3 + +got_result: + ! 
check to see if answer should be < 0 + tst %g3 + bl,a 1f + sub %g0, %o3, %o3 +1: + retl + mov %o3, %o0 + +#endif + diff --git a/libgcc/config/sparc/t-softmul b/libgcc/config/sparc/t-softmul index 49faae47c53..7142200600f 100644 --- a/libgcc/config/sparc/t-softmul +++ b/libgcc/config/sparc/t-softmul @@ -1,2 +1,2 @@ -LIB1ASMSRC = sparc/lb1spc.asm +LIB1ASMSRC = sparc/lb1spc.S LIB1ASMFUNCS = _mulsi3 _divsi3 _modsi3 diff --git a/libgcc/config/v850/lib1funcs.S b/libgcc/config/v850/lib1funcs.S new file mode 100644 index 00000000000..04e9b1e0ad4 --- /dev/null +++ b/libgcc/config/v850/lib1funcs.S @@ -0,0 +1,2330 @@ +/* libgcc routines for NEC V850. + Copyright (C) 1996, 1997, 2002, 2005, 2009, 2010 + Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3, or (at your option) any +later version. + +This file is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. 
*/
+
+#ifdef L_mulsi3
+	.text
+	.globl	___mulsi3
+	.type	___mulsi3,@function
+___mulsi3:
+#ifdef __v850__
+/*
+   #define SHIFT 12
+   #define MASK ((1 << SHIFT) - 1)
+
+   #define STEP(i, j) \
+   ({ \
+   short a_part = (a >> (i)) & MASK; \
+   short b_part = (b >> (j)) & MASK; \
+   int res = (((int) a_part) * ((int) b_part)); \
+   res; \
+   })
+
+   int
+   __mulsi3 (unsigned a, unsigned b)
+   {
+   return STEP (0, 0) +
+   ((STEP (SHIFT, 0) + STEP (0, SHIFT)) << SHIFT) +
+   ((STEP (0, 2 * SHIFT) + STEP (SHIFT, SHIFT) + STEP (2 * SHIFT, 0))
+   << (2 * SHIFT));
+   }
+*/
+	/* 32-bit multiply built from halfword multiplies (mulh).
+	   NOTE: the C sketch above says SHIFT 12, but the code below works
+	   on 15-bit pieces: the mask is 32767 and the shifts are 15 and 30.
+	   In:  r6 = a, r7 = b.   Out: r10 = low 32 bits of a * b.
+	   Overwrites r6, r7 and r10-r16; returns through r31.
+	   Below, a0/a1/a2 and b0/b1/b2 name bits 0-14, 15-29 and 30-31
+	   of a and b respectively.  */
+	mov	r6, r14
+	movea	lo(32767), r0, r10	/* r10 = 15-bit mask */
+	and	r10, r14		/* r14 = a0 */
+	mov	r7, r15
+	and	r10, r15		/* r15 = b0 */
+	shr	15, r6
+	mov	r6, r13
+	and	r10, r13		/* r13 = a1 */
+	shr	15, r7
+	mov	r7, r12
+	and	r10, r12		/* r12 = b1 */
+	shr	15, r6			/* r6 = a2 */
+	shr	15, r7			/* r7 = b2 */
+	mov	r14, r10
+	mulh	r15, r10		/* r10 = a0 * b0 */
+	mov	r14, r11
+	mulh	r12, r11		/* r11 = a0 * b1 */
+	mov	r13, r16
+	mulh	r15, r16		/* r16 = a1 * b0 */
+	mulh	r14, r7			/* r7 = a0 * b2 */
+	mulh	r15, r6			/* r6 = a2 * b0 */
+	add	r16, r11		/* r11 = a0*b1 + a1*b0 */
+	mulh	r13, r12		/* r12 = a1 * b1 */
+	shl	15, r11
+	add	r11, r10		/* add the middle terms, weighted 2^15 */
+	add	r12, r7
+	add	r6, r7			/* r7 = a0*b2 + a1*b1 + a2*b0 */
+	shl	30, r7
+	add	r7, r10			/* add the high terms, weighted 2^30 */
+	jmp	[r31]
+#endif /* __v850__ */
+#if defined(__v850e__) || defined(__v850ea__) || defined(__v850e2__) || defined(__v850e2v3__)
+	/* This routine is almost unnecessary because gcc
+	generates the MUL instruction for the RTX mulsi3.
+	But if someone wants to link their application with
+	previously compiled v850 objects then they will
+	need this function. */
+
+	/* It isn't good to put the inst sequence as below;
+	mul r7, r6,
+	mov r6, r10, r0
+	In this case, there is a RAW hazard between them.
+	MUL inst takes 2 cycles in EX stage, then MOV inst
+	must wait 1 cycle.
*/
+	/* v850e and later: r10 = r7 * r6; the second mul destination is r0. */
+	mov	r7, r10
+	mul	r6, r10, r0
+	jmp	[r31]
+#endif /* __v850e__ */
+	.size	___mulsi3,.-___mulsi3
+#endif /* L_mulsi3 */
+
+
+#ifdef L_udivsi3
+	.text
+	.global	___udivsi3
+	.type	___udivsi3,@function
+	/* Unsigned 32-bit division.
+	   In:  r6 = dividend, r7 = divisor.   Out: r10 = quotient.
+	   The plain-V850 version is a shift-and-subtract loop that
+	   overwrites r6, r7, r12, r13 and r19; returns through r31.  */
+___udivsi3:
+#ifdef __v850__
+	mov	1,r12			/* r12 = quotient bit matching the current divisor shift */
+	mov	0,r10			/* r10 = quotient accumulator */
+	cmp	r6,r7
+	bnl	.L12			/* divisor >= dividend: no scaling needed */
+	movhi	hi(-2147483648),r0,r13	/* r13 = 0x80000000, top-bit mask */
+	cmp	r0,r7
+	blt	.L12			/* divisor already has its top bit set */
+.L4:
+	/* Scale the divisor (and its quotient bit) up until it reaches the
+	   dividend, the bit shifts out, or the divisor's top bit is set.  */
+	shl	1,r7
+	shl	1,r12
+	cmp	r6,r7
+	bnl	.L12
+	cmp	r0,r12
+	be	.L8			/* quotient bit shifted out: return r10 as is */
+	mov	r7,r19
+	and	r13,r19
+	be	.L4
+	br	.L12
+.L9:
+	/* One quotient bit: subtract the scaled divisor if it fits.  */
+	cmp	r7,r6
+	bl	.L10
+	sub	r7,r6
+	or	r12,r10
+.L10:
+	shr	1,r12
+	shr	1,r7
+.L12:
+	cmp	r0,r12
+	bne	.L9			/* loop until the quotient bit reaches zero */
+.L8:
+	jmp	[r31]
+
+#else /* defined(__v850e__) */
+
+	/* See comments at end of __mulsi3. */
+	mov	r6, r10
+	divu	r7, r10, r0
+	jmp	[r31]
+
+#endif /* __v850e__ */
+
+	.size	___udivsi3,.-___udivsi3
+#endif /* L_udivsi3 */
+
+#ifdef L_divsi3
+	.text
+	.globl	___divsi3
+	.type	___divsi3,@function
+	/* Signed 32-bit division.
+	   In:  r6 = dividend, r7 = divisor.   Out: r10 = quotient.
+	   The plain-V850 version makes both operands non-negative, calls
+	   ___udivsi3, and negates the result when the signs differed.
+	   r22 (the sign tracker) and r31 are saved on the stack.  */
+___divsi3:
+#ifdef __v850__
+	add	-8,sp
+	st.w	r31,4[sp]
+	st.w	r22,0[sp]
+	mov	1,r22			/* r22 = +1; negated once per negative operand */
+	tst	r7,r7
+	bp	.L3
+	subr	r0,r7			/* r7 = -r7 */
+	subr	r0,r22
+.L3:
+	tst	r6,r6
+	bp	.L4
+	subr	r0,r6			/* r6 = -r6 */
+	subr	r0,r22
+.L4:
+	jarl	___udivsi3,r31
+	cmp	r0,r22
+	bp	.L7			/* r22 still positive: result keeps its sign */
+	subr	r0,r10			/* r10 = -r10 */
+.L7:
+	ld.w	0[sp],r22
+	ld.w	4[sp],r31
+	add	8,sp
+	jmp	[r31]
+
+#else /* defined(__v850e__) */
+
+	/* See comments at end of __mulsi3. */
+	mov	r6, r10
+	div	r7, r10, r0
+	jmp	[r31]
+
+#endif /* __v850e__ */
+
+	.size	___divsi3,.-___divsi3
+#endif /* L_divsi3 */
+
+#ifdef L_umodsi3
+	.text
+	.globl	___umodsi3
+	.type	___umodsi3,@function
+	/* Unsigned 32-bit remainder.
+	   In:  r6 = dividend, r7 = divisor.   Out: r10 = remainder.
+	   The plain-V850 version computes a - (a / b) * b using
+	   ___udivsi3 and ___mulsi3, keeping a, b and r31 on the stack.  */
+___umodsi3:
+#ifdef __v850__
+	add	-12,sp
+	st.w	r31,8[sp]
+	st.w	r7,4[sp]
+	st.w	r6,0[sp]
+	jarl	___udivsi3,r31		/* r10 = a / b */
+	ld.w	4[sp],r7
+	mov	r10,r6
+	jarl	___mulsi3,r31		/* r10 = (a / b) * b */
+	ld.w	0[sp],r6
+	subr	r6,r10			/* r10 = a - (a / b) * b */
+	ld.w	8[sp],r31
+	add	12,sp
+	jmp	[r31]
+
+#else /* defined(__v850e__) */
+
+	/* See comments at end of __mulsi3.
*/
+	/* Three-operand divu: the third operand (r10) receives the remainder. */
+	divu	r7, r6, r10
+	jmp	[r31]
+
+#endif /* __v850e__ */
+
+	.size	___umodsi3,.-___umodsi3
+#endif /* L_umodsi3 */
+
+#ifdef L_modsi3
+	.text
+	.globl	___modsi3
+	.type	___modsi3,@function
+	/* Signed 32-bit remainder.
+	   In:  r6 = dividend, r7 = divisor.   Out: r10 = remainder.
+	   The plain-V850 version computes a - (a / b) * b using
+	   ___divsi3 and ___mulsi3, keeping a, b and r31 on the stack.  */
+___modsi3:
+#ifdef __v850__
+	add	-12,sp
+	st.w	r31,8[sp]
+	st.w	r7,4[sp]
+	st.w	r6,0[sp]
+	jarl	___divsi3,r31		/* r10 = a / b */
+	ld.w	4[sp],r7
+	mov	r10,r6
+	jarl	___mulsi3,r31		/* r10 = (a / b) * b */
+	ld.w	0[sp],r6
+	subr	r6,r10			/* r10 = a - (a / b) * b */
+	ld.w	8[sp],r31
+	add	12,sp
+	jmp	[r31]
+
+#else /* defined(__v850e__) */
+
+	/* See comments at end of __mulsi3. */
+	div	r7, r6, r10
+	jmp	[r31]
+
+#endif /* __v850e__ */
+
+	.size	___modsi3,.-___modsi3
+#endif /* L_modsi3 */
+
+#ifdef L_save_2
+	.text
+	.align	2
+	.globl	__save_r2_r29
+	.type	__save_r2_r29,@function
+	/* Allocate space and save registers 2, 20 .. 29 on the stack. */
+	/* Called via: jalr __save_r2_r29,r10. */
+	/* Frame layout (44 bytes): r29 at 0[sp], r28 at 4[sp], ... r20 at
+	   36[sp], r2 at 40[sp].  Returns through r10.  With __EP__ defined,
+	   ep is parked in r1 and temporarily pointed at the new frame so the
+	   sst.w form can be used, then restored.  */
+__save_r2_r29:
+#ifdef __EP__
+	mov	ep,r1
+	addi	-44,sp,sp
+	mov	sp,ep
+	sst.w	r29,0[ep]
+	sst.w	r28,4[ep]
+	sst.w	r27,8[ep]
+	sst.w	r26,12[ep]
+	sst.w	r25,16[ep]
+	sst.w	r24,20[ep]
+	sst.w	r23,24[ep]
+	sst.w	r22,28[ep]
+	sst.w	r21,32[ep]
+	sst.w	r20,36[ep]
+	sst.w	r2,40[ep]
+	mov	r1,ep
+#else
+	addi	-44,sp,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	st.w	r27,8[sp]
+	st.w	r26,12[sp]
+	st.w	r25,16[sp]
+	st.w	r24,20[sp]
+	st.w	r23,24[sp]
+	st.w	r22,28[sp]
+	st.w	r21,32[sp]
+	st.w	r20,36[sp]
+	st.w	r2,40[sp]
+#endif
+	jmp	[r10]
+	.size	__save_r2_r29,.-__save_r2_r29
+
+	/* Restore saved registers, deallocate stack and return to the user. */
+	/* Called via: jr __return_r2_r29.
*/ + .align 2 + .globl __return_r2_r29 + .type __return_r2_r29,@function +__return_r2_r29: +#ifdef __EP__ + mov ep,r1 + mov sp,ep + sld.w 0[ep],r29 + sld.w 4[ep],r28 + sld.w 8[ep],r27 + sld.w 12[ep],r26 + sld.w 16[ep],r25 + sld.w 20[ep],r24 + sld.w 24[ep],r23 + sld.w 28[ep],r22 + sld.w 32[ep],r21 + sld.w 36[ep],r20 + sld.w 40[ep],r2 + addi 44,sp,sp + mov r1,ep +#else + ld.w 0[sp],r29 + ld.w 4[sp],r28 + ld.w 8[sp],r27 + ld.w 12[sp],r26 + ld.w 16[sp],r25 + ld.w 20[sp],r24 + ld.w 24[sp],r23 + ld.w 28[sp],r22 + ld.w 32[sp],r21 + ld.w 36[sp],r20 + ld.w 40[sp],r2 + addi 44,sp,sp +#endif + jmp [r31] + .size __return_r2_r29,.-__return_r2_r29 +#endif /* L_save_2 */ + +#ifdef L_save_20 + .text + .align 2 + .globl __save_r20_r29 + .type __save_r20_r29,@function + /* Allocate space and save registers 20 .. 29 on the stack. */ + /* Called via: jalr __save_r20_r29,r10. */ +__save_r20_r29: +#ifdef __EP__ + mov ep,r1 + addi -40,sp,sp + mov sp,ep + sst.w r29,0[ep] + sst.w r28,4[ep] + sst.w r27,8[ep] + sst.w r26,12[ep] + sst.w r25,16[ep] + sst.w r24,20[ep] + sst.w r23,24[ep] + sst.w r22,28[ep] + sst.w r21,32[ep] + sst.w r20,36[ep] + mov r1,ep +#else + addi -40,sp,sp + st.w r29,0[sp] + st.w r28,4[sp] + st.w r27,8[sp] + st.w r26,12[sp] + st.w r25,16[sp] + st.w r24,20[sp] + st.w r23,24[sp] + st.w r22,28[sp] + st.w r21,32[sp] + st.w r20,36[sp] +#endif + jmp [r10] + .size __save_r20_r29,.-__save_r20_r29 + + /* Restore saved registers, deallocate stack and return to the user. */ + /* Called via: jr __return_r20_r29. 
*/ + .align 2 + .globl __return_r20_r29 + .type __return_r20_r29,@function +__return_r20_r29: +#ifdef __EP__ + mov ep,r1 + mov sp,ep + sld.w 0[ep],r29 + sld.w 4[ep],r28 + sld.w 8[ep],r27 + sld.w 12[ep],r26 + sld.w 16[ep],r25 + sld.w 20[ep],r24 + sld.w 24[ep],r23 + sld.w 28[ep],r22 + sld.w 32[ep],r21 + sld.w 36[ep],r20 + addi 40,sp,sp + mov r1,ep +#else + ld.w 0[sp],r29 + ld.w 4[sp],r28 + ld.w 8[sp],r27 + ld.w 12[sp],r26 + ld.w 16[sp],r25 + ld.w 20[sp],r24 + ld.w 24[sp],r23 + ld.w 28[sp],r22 + ld.w 32[sp],r21 + ld.w 36[sp],r20 + addi 40,sp,sp +#endif + jmp [r31] + .size __return_r20_r29,.-__return_r20_r29 +#endif /* L_save_20 */ + +#ifdef L_save_21 + .text + .align 2 + .globl __save_r21_r29 + .type __save_r21_r29,@function + /* Allocate space and save registers 21 .. 29 on the stack. */ + /* Called via: jalr __save_r21_r29,r10. */ +__save_r21_r29: +#ifdef __EP__ + mov ep,r1 + addi -36,sp,sp + mov sp,ep + sst.w r29,0[ep] + sst.w r28,4[ep] + sst.w r27,8[ep] + sst.w r26,12[ep] + sst.w r25,16[ep] + sst.w r24,20[ep] + sst.w r23,24[ep] + sst.w r22,28[ep] + sst.w r21,32[ep] + mov r1,ep +#else + addi -36,sp,sp + st.w r29,0[sp] + st.w r28,4[sp] + st.w r27,8[sp] + st.w r26,12[sp] + st.w r25,16[sp] + st.w r24,20[sp] + st.w r23,24[sp] + st.w r22,28[sp] + st.w r21,32[sp] +#endif + jmp [r10] + .size __save_r21_r29,.-__save_r21_r29 + + /* Restore saved registers, deallocate stack and return to the user. */ + /* Called via: jr __return_r21_r29. 
*/ + .align 2 + .globl __return_r21_r29 + .type __return_r21_r29,@function +__return_r21_r29: +#ifdef __EP__ + mov ep,r1 + mov sp,ep + sld.w 0[ep],r29 + sld.w 4[ep],r28 + sld.w 8[ep],r27 + sld.w 12[ep],r26 + sld.w 16[ep],r25 + sld.w 20[ep],r24 + sld.w 24[ep],r23 + sld.w 28[ep],r22 + sld.w 32[ep],r21 + addi 36,sp,sp + mov r1,ep +#else + ld.w 0[sp],r29 + ld.w 4[sp],r28 + ld.w 8[sp],r27 + ld.w 12[sp],r26 + ld.w 16[sp],r25 + ld.w 20[sp],r24 + ld.w 24[sp],r23 + ld.w 28[sp],r22 + ld.w 32[sp],r21 + addi 36,sp,sp +#endif + jmp [r31] + .size __return_r21_r29,.-__return_r21_r29 +#endif /* L_save_21 */ + +#ifdef L_save_22 + .text + .align 2 + .globl __save_r22_r29 + .type __save_r22_r29,@function + /* Allocate space and save registers 22 .. 29 on the stack. */ + /* Called via: jalr __save_r22_r29,r10. */ +__save_r22_r29: +#ifdef __EP__ + mov ep,r1 + addi -32,sp,sp + mov sp,ep + sst.w r29,0[ep] + sst.w r28,4[ep] + sst.w r27,8[ep] + sst.w r26,12[ep] + sst.w r25,16[ep] + sst.w r24,20[ep] + sst.w r23,24[ep] + sst.w r22,28[ep] + mov r1,ep +#else + addi -32,sp,sp + st.w r29,0[sp] + st.w r28,4[sp] + st.w r27,8[sp] + st.w r26,12[sp] + st.w r25,16[sp] + st.w r24,20[sp] + st.w r23,24[sp] + st.w r22,28[sp] +#endif + jmp [r10] + .size __save_r22_r29,.-__save_r22_r29 + + /* Restore saved registers, deallocate stack and return to the user. */ + /* Called via: jr __return_r22_r29. 
*/ + .align 2 + .globl __return_r22_r29 + .type __return_r22_r29,@function +__return_r22_r29: +#ifdef __EP__ + mov ep,r1 + mov sp,ep + sld.w 0[ep],r29 + sld.w 4[ep],r28 + sld.w 8[ep],r27 + sld.w 12[ep],r26 + sld.w 16[ep],r25 + sld.w 20[ep],r24 + sld.w 24[ep],r23 + sld.w 28[ep],r22 + addi 32,sp,sp + mov r1,ep +#else + ld.w 0[sp],r29 + ld.w 4[sp],r28 + ld.w 8[sp],r27 + ld.w 12[sp],r26 + ld.w 16[sp],r25 + ld.w 20[sp],r24 + ld.w 24[sp],r23 + ld.w 28[sp],r22 + addi 32,sp,sp +#endif + jmp [r31] + .size __return_r22_r29,.-__return_r22_r29 +#endif /* L_save_22 */ + +#ifdef L_save_23 + .text + .align 2 + .globl __save_r23_r29 + .type __save_r23_r29,@function + /* Allocate space and save registers 23 .. 29 on the stack. */ + /* Called via: jalr __save_r23_r29,r10. */ +__save_r23_r29: +#ifdef __EP__ + mov ep,r1 + addi -28,sp,sp + mov sp,ep + sst.w r29,0[ep] + sst.w r28,4[ep] + sst.w r27,8[ep] + sst.w r26,12[ep] + sst.w r25,16[ep] + sst.w r24,20[ep] + sst.w r23,24[ep] + mov r1,ep +#else + addi -28,sp,sp + st.w r29,0[sp] + st.w r28,4[sp] + st.w r27,8[sp] + st.w r26,12[sp] + st.w r25,16[sp] + st.w r24,20[sp] + st.w r23,24[sp] +#endif + jmp [r10] + .size __save_r23_r29,.-__save_r23_r29 + + /* Restore saved registers, deallocate stack and return to the user. */ + /* Called via: jr __return_r23_r29. */ + .align 2 + .globl __return_r23_r29 + .type __return_r23_r29,@function +__return_r23_r29: +#ifdef __EP__ + mov ep,r1 + mov sp,ep + sld.w 0[ep],r29 + sld.w 4[ep],r28 + sld.w 8[ep],r27 + sld.w 12[ep],r26 + sld.w 16[ep],r25 + sld.w 20[ep],r24 + sld.w 24[ep],r23 + addi 28,sp,sp + mov r1,ep +#else + ld.w 0[sp],r29 + ld.w 4[sp],r28 + ld.w 8[sp],r27 + ld.w 12[sp],r26 + ld.w 16[sp],r25 + ld.w 20[sp],r24 + ld.w 24[sp],r23 + addi 28,sp,sp +#endif + jmp [r31] + .size __return_r23_r29,.-__return_r23_r29 +#endif /* L_save_23 */ + +#ifdef L_save_24 + .text + .align 2 + .globl __save_r24_r29 + .type __save_r24_r29,@function + /* Allocate space and save registers 24 .. 29 on the stack. 
*/ + /* Called via: jalr __save_r24_r29,r10. */ +__save_r24_r29: +#ifdef __EP__ + mov ep,r1 + addi -24,sp,sp + mov sp,ep + sst.w r29,0[ep] + sst.w r28,4[ep] + sst.w r27,8[ep] + sst.w r26,12[ep] + sst.w r25,16[ep] + sst.w r24,20[ep] + mov r1,ep +#else + addi -24,sp,sp + st.w r29,0[sp] + st.w r28,4[sp] + st.w r27,8[sp] + st.w r26,12[sp] + st.w r25,16[sp] + st.w r24,20[sp] +#endif + jmp [r10] + .size __save_r24_r29,.-__save_r24_r29 + + /* Restore saved registers, deallocate stack and return to the user. */ + /* Called via: jr __return_r24_r29. */ + .align 2 + .globl __return_r24_r29 + .type __return_r24_r29,@function +__return_r24_r29: +#ifdef __EP__ + mov ep,r1 + mov sp,ep + sld.w 0[ep],r29 + sld.w 4[ep],r28 + sld.w 8[ep],r27 + sld.w 12[ep],r26 + sld.w 16[ep],r25 + sld.w 20[ep],r24 + addi 24,sp,sp + mov r1,ep +#else + ld.w 0[sp],r29 + ld.w 4[sp],r28 + ld.w 8[sp],r27 + ld.w 12[sp],r26 + ld.w 16[sp],r25 + ld.w 20[sp],r24 + addi 24,sp,sp +#endif + jmp [r31] + .size __return_r24_r29,.-__return_r24_r29 +#endif /* L_save_24 */ + +#ifdef L_save_25 + .text + .align 2 + .globl __save_r25_r29 + .type __save_r25_r29,@function + /* Allocate space and save registers 25 .. 29 on the stack. */ + /* Called via: jalr __save_r25_r29,r10. */ +__save_r25_r29: +#ifdef __EP__ + mov ep,r1 + addi -20,sp,sp + mov sp,ep + sst.w r29,0[ep] + sst.w r28,4[ep] + sst.w r27,8[ep] + sst.w r26,12[ep] + sst.w r25,16[ep] + mov r1,ep +#else + addi -20,sp,sp + st.w r29,0[sp] + st.w r28,4[sp] + st.w r27,8[sp] + st.w r26,12[sp] + st.w r25,16[sp] +#endif + jmp [r10] + .size __save_r25_r29,.-__save_r25_r29 + + /* Restore saved registers, deallocate stack and return to the user. */ + /* Called via: jr __return_r25_r29. 
*/
+	.align	2
+	.globl	__return_r25_r29
+	.type	__return_r25_r29,@function
+	/* Frame layout (20 bytes, as written by __save_r25_r29):
+	   r29 at 0[sp], r28 at 4[sp], r27 at 8[sp], r26 at 12[sp],
+	   r25 at 16[sp].  Returns through r31.  */
+__return_r25_r29:
+#ifdef __EP__
+	mov	ep,r1
+	mov	sp,ep
+	sld.w	0[ep],r29
+	sld.w	4[ep],r28
+	sld.w	8[ep],r27
+	sld.w	12[ep],r26
+	sld.w	16[ep],r25
+	addi	20,sp,sp
+	mov	r1,ep
+#else
+	/* Load through sp: without __EP__, ep is never pointed at the
+	   frame in this routine, so ep-relative loads would read from an
+	   unrelated address.  */
+	ld.w	0[sp],r29
+	ld.w	4[sp],r28
+	ld.w	8[sp],r27
+	ld.w	12[sp],r26
+	ld.w	16[sp],r25
+	addi	20,sp,sp
+#endif
+	jmp	[r31]
+	.size	__return_r25_r29,.-__return_r25_r29
+#endif /* L_save_25 */
+
+#ifdef L_save_26
+	.text
+	.align	2
+	.globl	__save_r26_r29
+	.type	__save_r26_r29,@function
+	/* Allocate space and save registers 26 .. 29 on the stack. */
+	/* Called via: jalr __save_r26_r29,r10. */
+	/* Frame layout (16 bytes): r29 at 0[sp] ... r26 at 12[sp].
+	   Returns through r10.  */
+__save_r26_r29:
+#ifdef __EP__
+	mov	ep,r1
+	add	-16,sp
+	mov	sp,ep
+	sst.w	r29,0[ep]
+	sst.w	r28,4[ep]
+	sst.w	r27,8[ep]
+	sst.w	r26,12[ep]
+	mov	r1,ep
+#else
+	add	-16,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	st.w	r27,8[sp]
+	st.w	r26,12[sp]
+#endif
+	jmp	[r10]
+	.size	__save_r26_r29,.-__save_r26_r29
+
+	/* Restore saved registers, deallocate stack and return to the user. */
+	/* Called via: jr __return_r26_r29. */
+	.align	2
+	.globl	__return_r26_r29
+	.type	__return_r26_r29,@function
+__return_r26_r29:
+#ifdef __EP__
+	mov	ep,r1
+	mov	sp,ep
+	sld.w	0[ep],r29
+	sld.w	4[ep],r28
+	sld.w	8[ep],r27
+	sld.w	12[ep],r26
+	addi	16,sp,sp
+	mov	r1,ep
+#else
+	ld.w	0[sp],r29
+	ld.w	4[sp],r28
+	ld.w	8[sp],r27
+	ld.w	12[sp],r26
+	addi	16,sp,sp
+#endif
+	jmp	[r31]
+	.size	__return_r26_r29,.-__return_r26_r29
+#endif /* L_save_26 */
+
+#ifdef L_save_27
+	.text
+	.align	2
+	.globl	__save_r27_r29
+	.type	__save_r27_r29,@function
+	/* Allocate space and save registers 27 .. 29 on the stack. */
+	/* Called via: jalr __save_r27_r29,r10. */
+	/* Frame layout (12 bytes): r29 at 0[sp], r28 at 4[sp], r27 at 8[sp].
+	   Returns through r10.  */
+__save_r27_r29:
+	add	-12,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	st.w	r27,8[sp]
+	jmp	[r10]
+	.size	__save_r27_r29,.-__save_r27_r29
+
+	/* Restore saved registers, deallocate stack and return to the user. */
+	/* Called via: jr __return_r27_r29.
*/
+	.align	2
+	.globl	__return_r27_r29
+	.type	__return_r27_r29,@function
+	/* Reloads r29, r28, r27 from 0/4/8[sp], pops 12 bytes and returns
+	   through r31.  */
+__return_r27_r29:
+	ld.w	0[sp],r29
+	ld.w	4[sp],r28
+	ld.w	8[sp],r27
+	add	12,sp
+	jmp	[r31]
+	.size	__return_r27_r29,.-__return_r27_r29
+#endif /* L_save_27 */
+
+#ifdef L_save_28
+	.text
+	.align	2
+	.globl	__save_r28_r29
+	.type	__save_r28_r29,@function
+	/* Allocate space and save registers 28,29 on the stack. */
+	/* Called via: jalr __save_r28_r29,r10. */
+	/* Frame layout (8 bytes): r29 at 0[sp], r28 at 4[sp].
+	   Returns through r10.  */
+__save_r28_r29:
+	add	-8,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	jmp	[r10]
+	.size	__save_r28_r29,.-__save_r28_r29
+
+	/* Restore saved registers, deallocate stack and return to the user. */
+	/* Called via: jr __return_r28_r29. */
+	.align	2
+	.globl	__return_r28_r29
+	.type	__return_r28_r29,@function
+__return_r28_r29:
+	ld.w	0[sp],r29
+	ld.w	4[sp],r28
+	add	8,sp
+	jmp	[r31]
+	.size	__return_r28_r29,.-__return_r28_r29
+#endif /* L_save_28 */
+
+#ifdef L_save_29
+	.text
+	.align	2
+	.globl	__save_r29
+	.type	__save_r29,@function
+	/* Allocate space and save register 29 on the stack. */
+	/* Called via: jalr __save_r29,r10. */
+__save_r29:
+	add	-4,sp
+	st.w	r29,0[sp]
+	jmp	[r10]
+	.size	__save_r29,.-__save_r29
+
+	/* Restore saved register 29, deallocate stack and return to the user. */
+	/* Called via: jr __return_r29. */
+	.align	2
+	.globl	__return_r29
+	.type	__return_r29,@function
+__return_r29:
+	ld.w	0[sp],r29
+	add	4,sp
+	jmp	[r31]
+	.size	__return_r29,.-__return_r29
+#endif /* L_save_29 */
+
+#ifdef L_save_2c
+	.text
+	.align	2
+	.globl	__save_r2_r31
+	.type	__save_r2_r31,@function
+	/* Allocate space and save registers 2, 20 .. 29, 31 on the stack. */
+	/* Also allocate space for the argument save area. */
+	/* NOTE(review): the routine allocates 48 bytes and stores twelve
+	   registers, which fills the frame exactly; no extra argument save
+	   area appears to be reserved here.  Confirm against the compiler's
+	   prologue expectations.  */
+	/* Called via: jalr __save_r2_r31,r10.
*/
+	/* Frame layout (48 bytes): r29 at 0[sp], r28 at 4[sp], ... r20 at
+	   36[sp], r2 at 40[sp], r31 at 44[sp].  Returns through r10.  */
+__save_r2_r31:
+#ifdef __EP__
+	mov	ep,r1
+	addi	-48,sp,sp
+	mov	sp,ep
+	sst.w	r29,0[ep]
+	sst.w	r28,4[ep]
+	sst.w	r27,8[ep]
+	sst.w	r26,12[ep]
+	sst.w	r25,16[ep]
+	sst.w	r24,20[ep]
+	sst.w	r23,24[ep]
+	sst.w	r22,28[ep]
+	sst.w	r21,32[ep]
+	sst.w	r20,36[ep]
+	sst.w	r2,40[ep]
+	sst.w	r31,44[ep]
+	mov	r1,ep
+#else
+	addi	-48,sp,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	st.w	r27,8[sp]
+	st.w	r26,12[sp]
+	st.w	r25,16[sp]
+	st.w	r24,20[sp]
+	st.w	r23,24[sp]
+	st.w	r22,28[sp]
+	st.w	r21,32[sp]
+	st.w	r20,36[sp]
+	st.w	r2,40[sp]
+	st.w	r31,44[sp]
+#endif
+	jmp	[r10]
+	.size	__save_r2_r31,.-__save_r2_r31
+
+	/* Restore saved registers, deallocate stack and return to the user. */
+	/* Called via: jr __return_r2_r31. */
+	.align	2
+	.globl	__return_r2_r31
+	.type	__return_r2_r31,@function
+__return_r2_r31:
+#ifdef __EP__
+	mov	ep,r1
+	mov	sp,ep
+	sld.w	0[ep],r29
+	sld.w	4[ep],r28
+	sld.w	8[ep],r27
+	sld.w	12[ep],r26
+	sld.w	16[ep],r25
+	sld.w	20[ep],r24
+	sld.w	24[ep],r23
+	sld.w	28[ep],r22
+	sld.w	32[ep],r21
+	sld.w	36[ep],r20
+	sld.w	40[ep],r2
+	sld.w	44[ep],r31
+	addi	48,sp,sp
+	mov	r1,ep
+#else
+	/* Offsets must mirror __save_r2_r31 (and the __EP__ path above):
+	   r29 lives at 0[sp] and r31 at 44[sp].  Reading the slots in the
+	   opposite order would reload every register, including the return
+	   address in r31, from the wrong slot.  */
+	ld.w	0[sp],r29
+	ld.w	4[sp],r28
+	ld.w	8[sp],r27
+	ld.w	12[sp],r26
+	ld.w	16[sp],r25
+	ld.w	20[sp],r24
+	ld.w	24[sp],r23
+	ld.w	28[sp],r22
+	ld.w	32[sp],r21
+	ld.w	36[sp],r20
+	ld.w	40[sp],r2
+	ld.w	44[sp],r31
+	addi	48,sp,sp
+#endif
+	jmp	[r31]
+	.size	__return_r2_r31,.-__return_r2_r31
+#endif /* L_save_2c */
+
+#ifdef L_save_20c
+	.text
+	.align	2
+	.globl	__save_r20_r31
+	.type	__save_r20_r31,@function
+	/* Allocate space and save registers 20 .. 29, 31 on the stack. */
+	/* Also allocate space for the argument save area. */
+	/* Called via: jalr __save_r20_r31,r10.
*/ +__save_r20_r31: +#ifdef __EP__ + mov ep,r1 + addi -44,sp,sp + mov sp,ep + sst.w r29,0[ep] + sst.w r28,4[ep] + sst.w r27,8[ep] + sst.w r26,12[ep] + sst.w r25,16[ep] + sst.w r24,20[ep] + sst.w r23,24[ep] + sst.w r22,28[ep] + sst.w r21,32[ep] + sst.w r20,36[ep] + sst.w r31,40[ep] + mov r1,ep +#else + addi -44,sp,sp + st.w r29,0[sp] + st.w r28,4[sp] + st.w r27,8[sp] + st.w r26,12[sp] + st.w r25,16[sp] + st.w r24,20[sp] + st.w r23,24[sp] + st.w r22,28[sp] + st.w r21,32[sp] + st.w r20,36[sp] + st.w r31,40[sp] +#endif + jmp [r10] + .size __save_r20_r31,.-__save_r20_r31 + + /* Restore saved registers, deallocate stack and return to the user. */ + /* Called via: jr __return_r20_r31. */ + .align 2 + .globl __return_r20_r31 + .type __return_r20_r31,@function +__return_r20_r31: +#ifdef __EP__ + mov ep,r1 + mov sp,ep + sld.w 0[ep],r29 + sld.w 4[ep],r28 + sld.w 8[ep],r27 + sld.w 12[ep],r26 + sld.w 16[ep],r25 + sld.w 20[ep],r24 + sld.w 24[ep],r23 + sld.w 28[ep],r22 + sld.w 32[ep],r21 + sld.w 36[ep],r20 + sld.w 40[ep],r31 + addi 44,sp,sp + mov r1,ep +#else + ld.w 0[sp],r29 + ld.w 4[sp],r28 + ld.w 8[sp],r27 + ld.w 12[sp],r26 + ld.w 16[sp],r25 + ld.w 20[sp],r24 + ld.w 24[sp],r23 + ld.w 28[sp],r22 + ld.w 32[sp],r21 + ld.w 36[sp],r20 + ld.w 40[sp],r31 + addi 44,sp,sp +#endif + jmp [r31] + .size __return_r20_r31,.-__return_r20_r31 +#endif /* L_save_20c */ + +#ifdef L_save_21c + .text + .align 2 + .globl __save_r21_r31 + .type __save_r21_r31,@function + /* Allocate space and save registers 21 .. 29, 31 on the stack. */ + /* Also allocate space for the argument save area. */ + /* Called via: jalr __save_r21_r31,r10. 
*/ +__save_r21_r31: +#ifdef __EP__ + mov ep,r1 + addi -40,sp,sp + mov sp,ep + sst.w r29,0[ep] + sst.w r28,4[ep] + sst.w r27,8[ep] + sst.w r26,12[ep] + sst.w r25,16[ep] + sst.w r24,20[ep] + sst.w r23,24[ep] + sst.w r22,28[ep] + sst.w r21,32[ep] + sst.w r31,36[ep] + mov r1,ep + jmp [r10] +#else + addi -40,sp,sp + st.w r29,0[sp] + st.w r28,4[sp] + st.w r27,8[sp] + st.w r26,12[sp] + st.w r25,16[sp] + st.w r24,20[sp] + st.w r23,24[sp] + st.w r22,28[sp] + st.w r21,32[sp] + st.w r31,36[sp] + jmp [r10] +#endif + .size __save_r21_r31,.-__save_r21_r31 + + /* Restore saved registers, deallocate stack and return to the user. */ + /* Called via: jr __return_r21_r31. */ + .align 2 + .globl __return_r21_r31 + .type __return_r21_r31,@function +__return_r21_r31: +#ifdef __EP__ + mov ep,r1 + mov sp,ep + sld.w 0[ep],r29 + sld.w 4[ep],r28 + sld.w 8[ep],r27 + sld.w 12[ep],r26 + sld.w 16[ep],r25 + sld.w 20[ep],r24 + sld.w 24[ep],r23 + sld.w 28[ep],r22 + sld.w 32[ep],r21 + sld.w 36[ep],r31 + addi 40,sp,sp + mov r1,ep +#else + ld.w 0[sp],r29 + ld.w 4[sp],r28 + ld.w 8[sp],r27 + ld.w 12[sp],r26 + ld.w 16[sp],r25 + ld.w 20[sp],r24 + ld.w 24[sp],r23 + ld.w 28[sp],r22 + ld.w 32[sp],r21 + ld.w 36[sp],r31 + addi 40,sp,sp +#endif + jmp [r31] + .size __return_r21_r31,.-__return_r21_r31 +#endif /* L_save_21c */ + +#ifdef L_save_22c + .text + .align 2 + .globl __save_r22_r31 + .type __save_r22_r31,@function + /* Allocate space and save registers 22 .. 29, 31 on the stack. */ + /* Also allocate space for the argument save area. */ + /* Called via: jalr __save_r22_r31,r10. 
*/ +__save_r22_r31: +#ifdef __EP__ + mov ep,r1 + addi -36,sp,sp + mov sp,ep + sst.w r29,0[ep] + sst.w r28,4[ep] + sst.w r27,8[ep] + sst.w r26,12[ep] + sst.w r25,16[ep] + sst.w r24,20[ep] + sst.w r23,24[ep] + sst.w r22,28[ep] + sst.w r31,32[ep] + mov r1,ep +#else + addi -36,sp,sp + st.w r29,0[sp] + st.w r28,4[sp] + st.w r27,8[sp] + st.w r26,12[sp] + st.w r25,16[sp] + st.w r24,20[sp] + st.w r23,24[sp] + st.w r22,28[sp] + st.w r31,32[sp] +#endif + jmp [r10] + .size __save_r22_r31,.-__save_r22_r31 + + /* Restore saved registers, deallocate stack and return to the user. */ + /* Called via: jr __return_r22_r31. */ + .align 2 + .globl __return_r22_r31 + .type __return_r22_r31,@function +__return_r22_r31: +#ifdef __EP__ + mov ep,r1 + mov sp,ep + sld.w 0[ep],r29 + sld.w 4[ep],r28 + sld.w 8[ep],r27 + sld.w 12[ep],r26 + sld.w 16[ep],r25 + sld.w 20[ep],r24 + sld.w 24[ep],r23 + sld.w 28[ep],r22 + sld.w 32[ep],r31 + addi 36,sp,sp + mov r1,ep +#else + ld.w 0[sp],r29 + ld.w 4[sp],r28 + ld.w 8[sp],r27 + ld.w 12[sp],r26 + ld.w 16[sp],r25 + ld.w 20[sp],r24 + ld.w 24[sp],r23 + ld.w 28[sp],r22 + ld.w 32[sp],r31 + addi 36,sp,sp +#endif + jmp [r31] + .size __return_r22_r31,.-__return_r22_r31 +#endif /* L_save_22c */ + +#ifdef L_save_23c + .text + .align 2 + .globl __save_r23_r31 + .type __save_r23_r31,@function + /* Allocate space and save registers 23 .. 29, 31 on the stack. */ + /* Also allocate space for the argument save area. */ + /* Called via: jalr __save_r23_r31,r10. 
*/ +__save_r23_r31: +#ifdef __EP__ + mov ep,r1 + addi -32,sp,sp + mov sp,ep + sst.w r29,0[ep] + sst.w r28,4[ep] + sst.w r27,8[ep] + sst.w r26,12[ep] + sst.w r25,16[ep] + sst.w r24,20[ep] + sst.w r23,24[ep] + sst.w r31,28[ep] + mov r1,ep +#else + addi -32,sp,sp + st.w r29,0[sp] + st.w r28,4[sp] + st.w r27,8[sp] + st.w r26,12[sp] + st.w r25,16[sp] + st.w r24,20[sp] + st.w r23,24[sp] + st.w r31,28[sp] +#endif + jmp [r10] + .size __save_r23_r31,.-__save_r23_r31 + + /* Restore saved registers, deallocate stack and return to the user. */ + /* Called via: jr __return_r23_r31. */ + .align 2 + .globl __return_r23_r31 + .type __return_r23_r31,@function +__return_r23_r31: +#ifdef __EP__ + mov ep,r1 + mov sp,ep + sld.w 0[ep],r29 + sld.w 4[ep],r28 + sld.w 8[ep],r27 + sld.w 12[ep],r26 + sld.w 16[ep],r25 + sld.w 20[ep],r24 + sld.w 24[ep],r23 + sld.w 28[ep],r31 + addi 32,sp,sp + mov r1,ep +#else + ld.w 0[sp],r29 + ld.w 4[sp],r28 + ld.w 8[sp],r27 + ld.w 12[sp],r26 + ld.w 16[sp],r25 + ld.w 20[sp],r24 + ld.w 24[sp],r23 + ld.w 28[sp],r31 + addi 32,sp,sp +#endif + jmp [r31] + .size __return_r23_r31,.-__return_r23_r31 +#endif /* L_save_23c */ + +#ifdef L_save_24c + .text + .align 2 + .globl __save_r24_r31 + .type __save_r24_r31,@function + /* Allocate space and save registers 24 .. 29, 31 on the stack. */ + /* Also allocate space for the argument save area. */ + /* Called via: jalr __save_r24_r31,r10. */ +__save_r24_r31: +#ifdef __EP__ + mov ep,r1 + addi -28,sp,sp + mov sp,ep + sst.w r29,0[ep] + sst.w r28,4[ep] + sst.w r27,8[ep] + sst.w r26,12[ep] + sst.w r25,16[ep] + sst.w r24,20[ep] + sst.w r31,24[ep] + mov r1,ep +#else + addi -28,sp,sp + st.w r29,0[sp] + st.w r28,4[sp] + st.w r27,8[sp] + st.w r26,12[sp] + st.w r25,16[sp] + st.w r24,20[sp] + st.w r31,24[sp] +#endif + jmp [r10] + .size __save_r24_r31,.-__save_r24_r31 + + /* Restore saved registers, deallocate stack and return to the user. */ + /* Called via: jr __return_r24_r31. 
*/ + .align 2 + .globl __return_r24_r31 + .type __return_r24_r31,@function +__return_r24_r31: +#ifdef __EP__ + mov ep,r1 + mov sp,ep + sld.w 0[ep],r29 + sld.w 4[ep],r28 + sld.w 8[ep],r27 + sld.w 12[ep],r26 + sld.w 16[ep],r25 + sld.w 20[ep],r24 + sld.w 24[ep],r31 + addi 28,sp,sp + mov r1,ep +#else + ld.w 0[sp],r29 + ld.w 4[sp],r28 + ld.w 8[sp],r27 + ld.w 12[sp],r26 + ld.w 16[sp],r25 + ld.w 20[sp],r24 + ld.w 24[sp],r31 + addi 28,sp,sp +#endif + jmp [r31] + .size __return_r24_r31,.-__return_r24_r31 +#endif /* L_save_24c */ + +#ifdef L_save_25c + .text + .align 2 + .globl __save_r25_r31 + .type __save_r25_r31,@function + /* Allocate space and save registers 25 .. 29, 31 on the stack. */ + /* Also allocate space for the argument save area. */ + /* Called via: jalr __save_r25_r31,r10. */ +__save_r25_r31: +#ifdef __EP__ + mov ep,r1 + addi -24,sp,sp + mov sp,ep + sst.w r29,0[ep] + sst.w r28,4[ep] + sst.w r27,8[ep] + sst.w r26,12[ep] + sst.w r25,16[ep] + sst.w r31,20[ep] + mov r1,ep +#else + addi -24,sp,sp + st.w r29,0[sp] + st.w r28,4[sp] + st.w r27,8[sp] + st.w r26,12[sp] + st.w r25,16[sp] + st.w r31,20[sp] +#endif + jmp [r10] + .size __save_r25_r31,.-__save_r25_r31 + + /* Restore saved registers, deallocate stack and return to the user. */ + /* Called via: jr __return_r25_r31. */ + .align 2 + .globl __return_r25_r31 + .type __return_r25_r31,@function +__return_r25_r31: +#ifdef __EP__ + mov ep,r1 + mov sp,ep + sld.w 0[ep],r29 + sld.w 4[ep],r28 + sld.w 8[ep],r27 + sld.w 12[ep],r26 + sld.w 16[ep],r25 + sld.w 20[ep],r31 + addi 24,sp,sp + mov r1,ep +#else + ld.w 0[sp],r29 + ld.w 4[sp],r28 + ld.w 8[sp],r27 + ld.w 12[sp],r26 + ld.w 16[sp],r25 + ld.w 20[sp],r31 + addi 24,sp,sp +#endif + jmp [r31] + .size __return_r25_r31,.-__return_r25_r31 +#endif /* L_save_25c */ + +#ifdef L_save_26c + .text + .align 2 + .globl __save_r26_r31 + .type __save_r26_r31,@function + /* Allocate space and save registers 26 .. 29, 31 on the stack. */ + /* Also allocate space for the argument save area. 
*/ + /* Called via: jalr __save_r26_r31,r10. */ +__save_r26_r31: +#ifdef __EP__ + mov ep,r1 + addi -20,sp,sp + mov sp,ep + sst.w r29,0[ep] + sst.w r28,4[ep] + sst.w r27,8[ep] + sst.w r26,12[ep] + sst.w r31,16[ep] + mov r1,ep +#else + addi -20,sp,sp + st.w r29,0[sp] + st.w r28,4[sp] + st.w r27,8[sp] + st.w r26,12[sp] + st.w r31,16[sp] +#endif + jmp [r10] + .size __save_r26_r31,.-__save_r26_r31 + + /* Restore saved registers, deallocate stack and return to the user. */ + /* Called via: jr __return_r26_r31. */ + .align 2 + .globl __return_r26_r31 + .type __return_r26_r31,@function +__return_r26_r31: +#ifdef __EP__ + mov ep,r1 + mov sp,ep + sld.w 0[ep],r29 + sld.w 4[ep],r28 + sld.w 8[ep],r27 + sld.w 12[ep],r26 + sld.w 16[ep],r31 + addi 20,sp,sp + mov r1,ep +#else + ld.w 0[sp],r29 + ld.w 4[sp],r28 + ld.w 8[sp],r27 + ld.w 12[sp],r26 + ld.w 16[sp],r31 + addi 20,sp,sp +#endif + jmp [r31] + .size __return_r26_r31,.-__return_r26_r31 +#endif /* L_save_26c */ + +#ifdef L_save_27c + .text + .align 2 + .globl __save_r27_r31 + .type __save_r27_r31,@function + /* Allocate space and save registers 27 .. 29, 31 on the stack. */ + /* Also allocate space for the argument save area. */ + /* Called via: jalr __save_r27_r31,r10. */ +__save_r27_r31: +#ifdef __EP__ + mov ep,r1 + addi -16,sp,sp + mov sp,ep + sst.w r29,0[ep] + sst.w r28,4[ep] + sst.w r27,8[ep] + sst.w r31,12[ep] + mov r1,ep +#else + addi -16,sp,sp + st.w r29,0[sp] + st.w r28,4[sp] + st.w r27,8[sp] + st.w r31,12[sp] +#endif + jmp [r10] + .size __save_r27_r31,.-__save_r27_r31 + + /* Restore saved registers, deallocate stack and return to the user. */ + /* Called via: jr __return_r27_r31. 
*/ + .align 2 + .globl __return_r27_r31 + .type __return_r27_r31,@function +__return_r27_r31: +#ifdef __EP__ + mov ep,r1 + mov sp,ep + sld.w 0[ep],r29 + sld.w 4[ep],r28 + sld.w 8[ep],r27 + sld.w 12[ep],r31 + addi 16,sp,sp + mov r1,ep +#else + ld.w 0[sp],r29 + ld.w 4[sp],r28 + ld.w 8[sp],r27 + ld.w 12[sp],r31 + addi 16,sp,sp +#endif + jmp [r31] + .size __return_r27_r31,.-__return_r27_r31 +#endif /* L_save_27c */ + +#ifdef L_save_28c + .text + .align 2 + .globl __save_r28_r31 + .type __save_r28_r31,@function + /* Allocate space and save registers 28 .. 29, 31 on the stack. */ + /* Also allocate space for the argument save area. */ + /* Called via: jalr __save_r28_r31,r10. */ +__save_r28_r31: + addi -12,sp,sp + st.w r29,0[sp] + st.w r28,4[sp] + st.w r31,8[sp] + jmp [r10] + .size __save_r28_r31,.-__save_r28_r31 + + /* Restore saved registers, deallocate stack and return to the user. */ + /* Called via: jr __return_r28_r31. */ + .align 2 + .globl __return_r28_r31 + .type __return_r28_r31,@function +__return_r28_r31: + ld.w 0[sp],r29 + ld.w 4[sp],r28 + ld.w 8[sp],r31 + addi 12,sp,sp + jmp [r31] + .size __return_r28_r31,.-__return_r28_r31 +#endif /* L_save_28c */ + +#ifdef L_save_29c + .text + .align 2 + .globl __save_r29_r31 + .type __save_r29_r31,@function + /* Allocate space and save registers 29 & 31 on the stack. */ + /* Also allocate space for the argument save area. */ + /* Called via: jalr __save_r29_r31,r10. */ +__save_r29_r31: + addi -8,sp,sp + st.w r29,0[sp] + st.w r31,4[sp] + jmp [r10] + .size __save_r29_r31,.-__save_r29_r31 + + /* Restore saved registers, deallocate stack and return to the user. */ + /* Called via: jr __return_r29_r31. 
*/ + .align 2 + .globl __return_r29_r31 + .type __return_r29_r31,@function +__return_r29_r31: + ld.w 0[sp],r29 + ld.w 4[sp],r31 + addi 8,sp,sp + jmp [r31] + .size __return_r29_r31,.-__return_r29_r31 +#endif /* L_save_29c */ + +#ifdef L_save_31c + .text + .align 2 + .globl __save_r31 + .type __save_r31,@function + /* Allocate space and save register 31 on the stack. */ + /* Also allocate space for the argument save area. */ + /* Called via: jalr __save_r31,r10. */ +__save_r31: + addi -4,sp,sp + st.w r31,0[sp] + jmp [r10] + .size __save_r31,.-__save_r31 + + /* Restore saved registers, deallocate stack and return to the user. */ + /* Called via: jr __return_r31. */ + .align 2 + .globl __return_r31 + .type __return_r31,@function +__return_r31: + ld.w 0[sp],r31 + addi 4,sp,sp + jmp [r31] + .size __return_r31,.-__return_r31 +#endif /* L_save_31c */ + +#ifdef L_save_interrupt + .text + .align 2 + .globl __save_interrupt + .type __save_interrupt,@function + /* Save registers r1, r4 on stack and load up with expected values. */ + /* Note, 20 bytes of stack have already been allocated. */ + /* Called via: jalr __save_interrupt,r10. */ +__save_interrupt: + /* add -20,sp ; st.w r11,16[sp] ; st.w r10,12[sp] ; */ + st.w ep,0[sp] + st.w gp,4[sp] + st.w r1,8[sp] + movhi hi(__ep),r0,ep + movea lo(__ep),ep,ep + movhi hi(__gp),r0,gp + movea lo(__gp),gp,gp + jmp [r10] + .size __save_interrupt,.-__save_interrupt + + /* Restore saved registers, deallocate stack and return from the interrupt. */ + /* Called via: jr __return_interrupt. 
*/ + .align 2 + .globl __return_interrupt + .type __return_interrupt,@function +__return_interrupt: + ld.w 0[sp],ep + ld.w 4[sp],gp + ld.w 8[sp],r1 + ld.w 12[sp],r10 + ld.w 16[sp],r11 + addi 20,sp,sp + reti + .size __return_interrupt,.-__return_interrupt +#endif /* L_save_interrupt */ + +#ifdef L_save_all_interrupt + .text + .align 2 + .globl __save_all_interrupt + .type __save_all_interrupt,@function + /* Save all registers except for those saved in __save_interrupt. */ + /* Allocate 104 bytes of stack: one word for each of the 26 registers stored below. */ + /* Called via: jalr __save_all_interrupt,r10. */ +__save_all_interrupt: + addi -104,sp,sp +#ifdef __EP__ + mov ep,r1 + mov sp,ep + sst.w r31,100[ep] + sst.w r2,96[ep] + sst.w gp,92[ep] + sst.w r6,88[ep] + sst.w r7,84[ep] + sst.w r8,80[ep] + sst.w r9,76[ep] + sst.w r11,72[ep] + sst.w r12,68[ep] + sst.w r13,64[ep] + sst.w r14,60[ep] + sst.w r15,56[ep] + sst.w r16,52[ep] + sst.w r17,48[ep] + sst.w r18,44[ep] + sst.w r19,40[ep] + sst.w r20,36[ep] + sst.w r21,32[ep] + sst.w r22,28[ep] + sst.w r23,24[ep] + sst.w r24,20[ep] + sst.w r25,16[ep] + sst.w r26,12[ep] + sst.w r27,8[ep] + sst.w r28,4[ep] + sst.w r29,0[ep] + mov r1,ep +#else + st.w r31,100[sp] + st.w r2,96[sp] + st.w gp,92[sp] + st.w r6,88[sp] + st.w r7,84[sp] + st.w r8,80[sp] + st.w r9,76[sp] + st.w r11,72[sp] + st.w r12,68[sp] + st.w r13,64[sp] + st.w r14,60[sp] + st.w r15,56[sp] + st.w r16,52[sp] + st.w r17,48[sp] + st.w r18,44[sp] + st.w r19,40[sp] + st.w r20,36[sp] + st.w r21,32[sp] + st.w r22,28[sp] + st.w r23,24[sp] + st.w r24,20[sp] + st.w r25,16[sp] + st.w r26,12[sp] + st.w r27,8[sp] + st.w r28,4[sp] + st.w r29,0[sp] +#endif + jmp [r10] + .size __save_all_interrupt,.-__save_all_interrupt + + .globl __restore_all_interrupt + .type __restore_all_interrupt,@function + /* Restore all registers saved in __save_all_interrupt and + deallocate the stack space. */ + /* Called via: jalr __restore_all_interrupt,r10. 
*/ +__restore_all_interrupt: +#ifdef __EP__ + mov ep,r1 + mov sp,ep + sld.w 100[ep],r31 + sld.w 96[ep],r2 + sld.w 92[ep],gp + sld.w 88[ep],r6 + sld.w 84[ep],r7 + sld.w 80[ep],r8 + sld.w 76[ep],r9 + sld.w 72[ep],r11 + sld.w 68[ep],r12 + sld.w 64[ep],r13 + sld.w 60[ep],r14 + sld.w 56[ep],r15 + sld.w 52[ep],r16 + sld.w 48[ep],r17 + sld.w 44[ep],r18 + sld.w 40[ep],r19 + sld.w 36[ep],r20 + sld.w 32[ep],r21 + sld.w 28[ep],r22 + sld.w 24[ep],r23 + sld.w 20[ep],r24 + sld.w 16[ep],r25 + sld.w 12[ep],r26 + sld.w 8[ep],r27 + sld.w 4[ep],r28 + sld.w 0[ep],r29 + mov r1,ep +#else + ld.w 100[sp],r31 + ld.w 96[sp],r2 + ld.w 92[sp],gp + ld.w 88[sp],r6 + ld.w 84[sp],r7 + ld.w 80[sp],r8 + ld.w 76[sp],r9 + ld.w 72[sp],r11 + ld.w 68[sp],r12 + ld.w 64[sp],r13 + ld.w 60[sp],r14 + ld.w 56[sp],r15 + ld.w 52[sp],r16 + ld.w 48[sp],r17 + ld.w 44[sp],r18 + ld.w 40[sp],r19 + ld.w 36[sp],r20 + ld.w 32[sp],r21 + ld.w 28[sp],r22 + ld.w 24[sp],r23 + ld.w 20[sp],r24 + ld.w 16[sp],r25 + ld.w 12[sp],r26 + ld.w 8[sp],r27 + ld.w 4[sp],r28 + ld.w 0[sp],r29 +#endif + addi 104,sp,sp + jmp [r10] + .size __restore_all_interrupt,.-__restore_all_interrupt +#endif /* L_save_all_interrupt */ + +#if defined(__v850e__) || defined(__v850e1__) || defined(__v850e2__) || defined(__v850e2v3__) +#ifdef L_callt_save_r2_r29 + /* Put these functions into the call table area. */ + .call_table_text + + /* Allocate space and save registers 2, 20 .. 29 on the stack. */ + /* Called via: callt ctoff(__callt_save_r2_r29). */ + .align 2 +.L_save_r2_r29: + add -4, sp + st.w r2, 0[sp] + prepare {r20 - r29}, 0 + ctret + + /* Restore saved registers, deallocate stack and return to the user. */ + /* Called via: callt ctoff(__callt_return_r2_r29). */ + .align 2 +.L_return_r2_r29: + dispose 0, {r20-r29} + ld.w 0[sp], r2 + add 4, sp + jmp [r31] + + /* Place the offsets of the start of these routines into the call table. 
*/ + .call_table_data + + .global __callt_save_r2_r29 + .type __callt_save_r2_r29,@function +__callt_save_r2_r29: .short ctoff(.L_save_r2_r29) + + .global __callt_return_r2_r29 + .type __callt_return_r2_r29,@function +__callt_return_r2_r29: .short ctoff(.L_return_r2_r29) + +#endif /* L_callt_save_r2_r29. */ + +#ifdef L_callt_save_r2_r31 + /* Put these functions into the call table area. */ + .call_table_text + + /* Allocate space and save registers 2 and 20 .. 29, 31 on the stack. */ + /* Also allocate space for the argument save area. */ + /* Called via: callt ctoff(__callt_save_r2_r31). */ + .align 2 +.L_save_r2_r31: + add -4, sp + st.w r2, 0[sp] + prepare {r20 - r29, r31}, 0 + ctret + + /* Restore saved registers, deallocate stack and return to the user. */ + /* Called via: callt ctoff(__callt_return_r2_r31). */ + .align 2 +.L_return_r2_r31: + dispose 0, {r20 - r29, r31} + ld.w 0[sp], r2 + addi 4, sp, sp + jmp [r31] + + /* Place the offsets of the start of these routines into the call table. */ + .call_table_data + + .global __callt_save_r2_r31 + .type __callt_save_r2_r31,@function +__callt_save_r2_r31: .short ctoff(.L_save_r2_r31) + + .global __callt_return_r2_r31 + .type __callt_return_r2_r31,@function +__callt_return_r2_r31: .short ctoff(.L_return_r2_r31) + +#endif /* L_callt_save_r2_r31 */ + +#ifdef L_callt_save_interrupt + /* Put these functions into the call table area. */ + .call_table_text + + /* Save registers ep, gp and r1 on the stack, then load ep and gp with __ep and __gp. */ + /* Called via: callt ctoff(__callt_save_interrupt). */ + .align 2 +.L_save_interrupt: + /* SP has already been moved before callt ctoff(__callt_save_interrupt). */ + /* R1,R10,R11,ctpc,ctpsw have already been saved before callt ctoff(__callt_save_interrupt). 
*/ + /* addi -28, sp, sp */ + /* st.w r1, 24[sp] */ + /* st.w r10, 12[sp] */ + /* st.w r11, 16[sp] */ + /* stsr ctpc, r10 */ + /* st.w r10, 20[sp] */ + /* stsr ctpsw, r10 */ + /* st.w r10, 24[sp] */ + st.w ep, 0[sp] + st.w gp, 4[sp] + st.w r1, 8[sp] + mov hilo(__ep),ep + mov hilo(__gp),gp + ctret + + .call_table_text + /* Restore saved registers, deallocate stack and return from the interrupt. */ + /* Called via: callt ctoff(__callt_return_interrupt). */ + .align 2 + .globl __return_interrupt + .type __return_interrupt,@function +.L_return_interrupt: + ld.w 24[sp], r1 + ldsr r1, ctpsw + ld.w 20[sp], r1 + ldsr r1, ctpc + ld.w 16[sp], r11 + ld.w 12[sp], r10 + ld.w 8[sp], r1 + ld.w 4[sp], gp + ld.w 0[sp], ep + addi 28, sp, sp + reti + + /* Place the offsets of the start of these routines into the call table. */ + .call_table_data + + .global __callt_save_interrupt + .type __callt_save_interrupt,@function +__callt_save_interrupt: .short ctoff(.L_save_interrupt) + + .global __callt_return_interrupt + .type __callt_return_interrupt,@function +__callt_return_interrupt: .short ctoff(.L_return_interrupt) + +#endif /* L_callt_save_interrupt */ + +#ifdef L_callt_save_all_interrupt + /* Put these functions into the call table area. */ + .call_table_text + + /* Save all registers except for those saved in __save_interrupt. */ + /* Allocate enough stack for all of the registers & 16 bytes of space. */ + /* Called via: callt ctoff(__callt_save_all_interrupt). 
*/ + .align 2 +.L_save_all_interrupt: + addi -60, sp, sp +#ifdef __EP__ + mov ep, r1 + mov sp, ep + sst.w r2, 56[ep] + sst.w r5, 52[ep] + sst.w r6, 48[ep] + sst.w r7, 44[ep] + sst.w r8, 40[ep] + sst.w r9, 36[ep] + sst.w r11, 32[ep] + sst.w r12, 28[ep] + sst.w r13, 24[ep] + sst.w r14, 20[ep] + sst.w r15, 16[ep] + sst.w r16, 12[ep] + sst.w r17, 8[ep] + sst.w r18, 4[ep] + sst.w r19, 0[ep] + mov r1, ep +#else + st.w r2, 56[sp] + st.w r5, 52[sp] + st.w r6, 48[sp] + st.w r7, 44[sp] + st.w r8, 40[sp] + st.w r9, 36[sp] + st.w r11, 32[sp] + st.w r12, 28[sp] + st.w r13, 24[sp] + st.w r14, 20[sp] + st.w r15, 16[sp] + st.w r16, 12[sp] + st.w r17, 8[sp] + st.w r18, 4[sp] + st.w r19, 0[sp] +#endif + prepare {r20 - r29, r31}, 0 + ctret + + /* Restore all registers saved in __save_all_interrupt + deallocate the stack space. */ + /* Called via: callt ctoff(__callt_restore_all_interrupt). */ + .align 2 +.L_restore_all_interrupt: + dispose 0, {r20 - r29, r31} +#ifdef __EP__ + mov ep, r1 + mov sp, ep + sld.w 0 [ep], r19 + sld.w 4 [ep], r18 + sld.w 8 [ep], r17 + sld.w 12[ep], r16 + sld.w 16[ep], r15 + sld.w 20[ep], r14 + sld.w 24[ep], r13 + sld.w 28[ep], r12 + sld.w 32[ep], r11 + sld.w 36[ep], r9 + sld.w 40[ep], r8 + sld.w 44[ep], r7 + sld.w 48[ep], r6 + sld.w 52[ep], r5 + sld.w 56[ep], r2 + mov r1, ep +#else + ld.w 0 [sp], r19 + ld.w 4 [sp], r18 + ld.w 8 [sp], r17 + ld.w 12[sp], r16 + ld.w 16[sp], r15 + ld.w 20[sp], r14 + ld.w 24[sp], r13 + ld.w 28[sp], r12 + ld.w 32[sp], r11 + ld.w 36[sp], r9 + ld.w 40[sp], r8 + ld.w 44[sp], r7 + ld.w 48[sp], r6 + ld.w 52[sp], r5 + ld.w 56[sp], r2 +#endif + addi 60, sp, sp + ctret + + /* Place the offsets of the start of these routines into the call table. 
*/ + .call_table_data + + .global __callt_save_all_interrupt + .type __callt_save_all_interrupt,@function +__callt_save_all_interrupt: .short ctoff(.L_save_all_interrupt) + + .global __callt_restore_all_interrupt + .type __callt_restore_all_interrupt,@function +__callt_restore_all_interrupt: .short ctoff(.L_restore_all_interrupt) + +#endif /* L_callt_save_all_interrupt */ + + +#define MAKE_CALLT_FUNCS( START ) \ + .call_table_text ;\ + .align 2 ;\ + /* Allocate space and save registers START .. r29 on the stack. */ ;\ + /* Called via: callt ctoff(__callt_save_START_r29). */ ;\ +.L_save_##START##_r29: ;\ + prepare { START - r29 }, 0 ;\ + ctret ;\ + ;\ + /* Restore saved registers, deallocate stack and return. */ ;\ + /* Called via: callt ctoff(__return_START_r29). */ ;\ + .align 2 ;\ +.L_return_##START##_r29: ;\ + dispose 0, { START - r29 }, r31 ;\ + ;\ + /* Place the offsets of the start of these funcs into the call table. */;\ + .call_table_data ;\ + ;\ + .global __callt_save_##START##_r29 ;\ + .type __callt_save_##START##_r29,@function ;\ +__callt_save_##START##_r29: .short ctoff(.L_save_##START##_r29 ) ;\ + ;\ + .global __callt_return_##START##_r29 ;\ + .type __callt_return_##START##_r29,@function ;\ +__callt_return_##START##_r29: .short ctoff(.L_return_##START##_r29 ) + + +#define MAKE_CALLT_CFUNCS( START ) \ + .call_table_text ;\ + .align 2 ;\ + /* Allocate space and save registers START .. r31 on the stack. */ ;\ + /* Called via: callt ctoff(__callt_save_START_r31c). */ ;\ +.L_save_##START##_r31c: ;\ + prepare { START - r29, r31}, 0 ;\ + ctret ;\ + ;\ + /* Restore saved registers, deallocate stack and return. */ ;\ + /* Called via: callt ctoff(__return_START_r31c). */ ;\ + .align 2 ;\ +.L_return_##START##_r31c: ;\ + dispose 0, { START - r29, r31}, r31 ;\ + ;\ + /* Place the offsets of the start of these funcs into the call table. 
*/;\ + .call_table_data ;\ + ;\ + .global __callt_save_##START##_r31c ;\ + .type __callt_save_##START##_r31c,@function ;\ +__callt_save_##START##_r31c: .short ctoff(.L_save_##START##_r31c ) ;\ + ;\ + .global __callt_return_##START##_r31c ;\ + .type __callt_return_##START##_r31c,@function ;\ +__callt_return_##START##_r31c: .short ctoff(.L_return_##START##_r31c ) + + +#ifdef L_callt_save_20 + MAKE_CALLT_FUNCS (r20) +#endif +#ifdef L_callt_save_21 + MAKE_CALLT_FUNCS (r21) +#endif +#ifdef L_callt_save_22 + MAKE_CALLT_FUNCS (r22) +#endif +#ifdef L_callt_save_23 + MAKE_CALLT_FUNCS (r23) +#endif +#ifdef L_callt_save_24 + MAKE_CALLT_FUNCS (r24) +#endif +#ifdef L_callt_save_25 + MAKE_CALLT_FUNCS (r25) +#endif +#ifdef L_callt_save_26 + MAKE_CALLT_FUNCS (r26) +#endif +#ifdef L_callt_save_27 + MAKE_CALLT_FUNCS (r27) +#endif +#ifdef L_callt_save_28 + MAKE_CALLT_FUNCS (r28) +#endif +#ifdef L_callt_save_29 + MAKE_CALLT_FUNCS (r29) +#endif + +#ifdef L_callt_save_20c + MAKE_CALLT_CFUNCS (r20) +#endif +#ifdef L_callt_save_21c + MAKE_CALLT_CFUNCS (r21) +#endif +#ifdef L_callt_save_22c + MAKE_CALLT_CFUNCS (r22) +#endif +#ifdef L_callt_save_23c + MAKE_CALLT_CFUNCS (r23) +#endif +#ifdef L_callt_save_24c + MAKE_CALLT_CFUNCS (r24) +#endif +#ifdef L_callt_save_25c + MAKE_CALLT_CFUNCS (r25) +#endif +#ifdef L_callt_save_26c + MAKE_CALLT_CFUNCS (r26) +#endif +#ifdef L_callt_save_27c + MAKE_CALLT_CFUNCS (r27) +#endif +#ifdef L_callt_save_28c + MAKE_CALLT_CFUNCS (r28) +#endif +#ifdef L_callt_save_29c + MAKE_CALLT_CFUNCS (r29) +#endif + + +#ifdef L_callt_save_31c + .call_table_text + .align 2 + /* Allocate space and save register r31 on the stack. */ + /* Called via: callt ctoff(__callt_save_r31c). */ +.L_callt_save_r31c: + prepare {r31}, 0 + ctret + + /* Restore saved registers, deallocate stack and return. */ + /* Called via: callt ctoff(__return_r31c). */ + .align 2 +.L_callt_return_r31c: + dispose 0, {r31}, r31 + + /* Place the offsets of the start of these funcs into the call table. 
*/ + .call_table_data + + .global __callt_save_r31c + .type __callt_save_r31c,@function +__callt_save_r31c: .short ctoff(.L_callt_save_r31c) + + .global __callt_return_r31c + .type __callt_return_r31c,@function +__callt_return_r31c: .short ctoff(.L_callt_return_r31c) +#endif + +#endif /* __v850e__ */ + +/* libgcc2 routines for NEC V850. */ +/* Double Integer Arithmetical Operation. */ + +#ifdef L_negdi2 + .text + .global ___negdi2 + .type ___negdi2, @function +___negdi2: + not r6, r10 + add 1, r10 + setf l, r6 + not r7, r11 + add r6, r11 + jmp [lp] + + .size ___negdi2,.-___negdi2 +#endif + +#ifdef L_cmpdi2 + .text + .global ___cmpdi2 + .type ___cmpdi2,@function +___cmpdi2: + # Signed comparison between the high words. + cmp r9, r7 + be .L_cmpdi_cmp_low + setf ge, r10 + setf gt, r6 + add r6, r10 + jmp [lp] +.L_cmpdi_cmp_low: + # Unsigned comparison between the low words. + cmp r8, r6 + setf nl, r10 + setf h, r6 + add r6, r10 + jmp [lp] + .size ___cmpdi2, . - ___cmpdi2 +#endif + +#ifdef L_ucmpdi2 + .text + .global ___ucmpdi2 + .type ___ucmpdi2,@function +___ucmpdi2: + cmp r9, r7 # Check whether the high words are the same. + bne .L_ucmpdi_check_psw + cmp r8, r6 # Compare the low words. +.L_ucmpdi_check_psw: + setf nl, r10 # + setf h, r6 # + add r6, r10 # Add the result of comparison NL and comparison H. + jmp [lp] + .size ___ucmpdi2, . 
- ___ucmpdi2 +#endif + +#ifdef L_muldi3 + .text + .global ___muldi3 + .type ___muldi3,@function +___muldi3: +#ifdef __v850__ + jarl __save_r26_r31, r10 + addi 16, sp, sp + mov r6, r28 + shr 15, r28 + movea lo(32767), r0, r14 + and r14, r28 + mov r8, r10 + shr 15, r10 + and r14, r10 + mov r6, r19 + shr 30, r19 + mov r7, r12 + shl 2, r12 + or r12, r19 + and r14, r19 + mov r8, r13 + shr 30, r13 + mov r9, r12 + shl 2, r12 + or r12, r13 + and r14, r13 + mov r7, r11 + shr 13, r11 + and r14, r11 + mov r9, r31 + shr 13, r31 + and r14, r31 + mov r7, r29 + shr 28, r29 + and r14, r29 + mov r9, r12 + shr 28, r12 + and r14, r12 + and r14, r6 + and r14, r8 + mov r6, r14 + mulh r8, r14 + mov r6, r16 + mulh r10, r16 + mov r6, r18 + mulh r13, r18 + mov r6, r15 + mulh r31, r15 + mulh r12, r6 + mov r28, r17 + mulh r10, r17 + add -16, sp + mov r28, r12 + mulh r8, r12 + add r17, r18 + mov r28, r17 + mulh r31, r17 + add r12, r16 + mov r28, r12 + mulh r13, r12 + add r17, r6 + mov r19, r17 + add r12, r15 + mov r19, r12 + mulh r8, r12 + mulh r10, r17 + add r12, r18 + mov r19, r12 + mulh r13, r12 + add r17, r15 + mov r11, r13 + mulh r8, r13 + add r12, r6 + mov r11, r12 + mulh r10, r12 + add r13, r15 + mulh r29, r8 + add r12, r6 + mov r16, r13 + shl 15, r13 + add r14, r13 + mov r18, r12 + shl 30, r12 + mov r13, r26 + add r12, r26 + shr 15, r14 + movhi hi(131071), r0, r12 + movea lo(131071), r12, r13 + and r13, r14 + mov r16, r12 + and r13, r12 + add r12, r14 + mov r18, r12 + shl 15, r12 + and r13, r12 + add r12, r14 + shr 17, r14 + shr 17, r16 + add r14, r16 + shl 13, r15 + shr 2, r18 + add r18, r15 + add r15, r16 + mov r16, r27 + add r8, r6 + shl 28, r6 + add r6, r27 + mov r26, r10 + mov r27, r11 + jr __return_r26_r31 +#else /* defined(__v850e__) */ + /* (Ahi << 32 + Alo) * (Bhi << 32 + Blo) */ + /* r7 r6 r9 r8 */ + mov r8, r10 + mulu r7, r8, r0 /* Ahi * Blo */ + mulu r6, r9, r0 /* Alo * Bhi */ + mulu r6, r10, r11 /* Alo * Blo */ + add r8, r11 + add r9, r11 + jmp [r31] +#endif /* 
defined(__v850e__) */ + .size ___muldi3, . - ___muldi3 +#endif + diff --git a/libgcc/config/v850/t-v850 b/libgcc/config/v850/t-v850 new file mode 100644 index 00000000000..b61703ace09 --- /dev/null +++ b/libgcc/config/v850/t-v850 @@ -0,0 +1,60 @@ +LIB1ASMSRC = v850/lib1funcs.S +LIB1ASMFUNCS = _mulsi3 \ + _divsi3 \ + _udivsi3 \ + _modsi3 \ + _umodsi3 \ + _save_2 \ + _save_20 \ + _save_21 \ + _save_22 \ + _save_23 \ + _save_24 \ + _save_25 \ + _save_26 \ + _save_27 \ + _save_28 \ + _save_29 \ + _save_2c \ + _save_20c \ + _save_21c \ + _save_22c \ + _save_23c \ + _save_24c \ + _save_25c \ + _save_26c \ + _save_27c \ + _save_28c \ + _save_29c \ + _save_31c \ + _save_interrupt \ + _save_all_interrupt \ + _callt_save_20 \ + _callt_save_21 \ + _callt_save_22 \ + _callt_save_23 \ + _callt_save_24 \ + _callt_save_25 \ + _callt_save_26 \ + _callt_save_27 \ + _callt_save_28 \ + _callt_save_29 \ + _callt_save_20c \ + _callt_save_21c \ + _callt_save_22c \ + _callt_save_23c \ + _callt_save_24c \ + _callt_save_25c \ + _callt_save_26c \ + _callt_save_27c \ + _callt_save_28c \ + _callt_save_29c \ + _callt_save_31c \ + _callt_save_interrupt \ + _callt_save_all_interrupt \ + _callt_save_r2_r29 \ + _callt_save_r2_r31 \ + _negdi2 \ + _cmpdi2 \ + _ucmpdi2 \ + _muldi3 diff --git a/libgcc/config/vax/lib1funcs.S b/libgcc/config/vax/lib1funcs.S new file mode 100644 index 00000000000..1d57b56dad9 --- /dev/null +++ b/libgcc/config/vax/lib1funcs.S @@ -0,0 +1,92 @@ +/* Copyright (C) 2009 Free Software Foundation, Inc. + This file is part of GCC. + Contributed by Maciej W. Rozycki <macro@linux-mips.org>. + + This file is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 3, or (at your option) any + later version. 
+ + This file is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifdef L_udivsi3 + .text + .globl __udivsi3 + .type __udivsi3, @function +__udivsi3: + .word 0 + movl 8(%ap), %r1 + blss 0f /* Check bit #31 of divisor. */ + movl 4(%ap), %r2 + blss 1f /* Check bit #31 of dividend. */ + + /* Bit #31 of both operands is clear, do a standard division. */ + + divl3 %r1, %r2, %r0 + ret + + /* MSB of divisor set, only 1 or 0 may result. */ +0: + decl %r1 + clrl %r0 + cmpl %r1, 4(%ap) + adwc $0, %r0 + ret + + /* MSB of dividend set, do an extended division. */ +1: + clrl %r3 + ediv %r1, %r2, %r0, %r3 + ret + .size __udivsi3, . - __udivsi3 + .previous +#endif + +#ifdef L_umodsi3 + .text + .globl __umodsi3 + .type __umodsi3, @function +__umodsi3: + .word 0 + movl 8(%ap), %r1 + blss 0f /* Check bit #31 of divisor. */ + movl 4(%ap), %r2 + blss 1f /* Check bit #31 of dividend. */ + + /* Bit #31 of both operands is clear, do a standard division. */ + + divl3 %r1, %r2, %r0 + mull2 %r0, %r1 + subl3 %r1, %r2, %r0 + ret + + /* MSB of divisor set, subtract the divisor at most once. */ +0: + movl 4(%ap), %r2 + clrl %r0 + cmpl %r2, %r1 + sbwc $0, %r0 + bicl2 %r0, %r1 + subl3 %r1, %r2, %r0 + ret + + /* MSB of dividend set, do an extended division. */ +1: + clrl %r3 + ediv %r1, %r2, %r3, %r0 + ret + .size __umodsi3, . 
- __umodsi3 + .previous +#endif diff --git a/libgcc/config/vax/t-linux b/libgcc/config/vax/t-linux new file mode 100644 index 00000000000..17929c8717c --- /dev/null +++ b/libgcc/config/vax/t-linux @@ -0,0 +1,2 @@ +LIB1ASMSRC = vax/lib1funcs.S +LIB1ASMFUNCS = _udivsi3 _umodsi3 diff --git a/libgcc/config/xtensa/ieee754-df.S b/libgcc/config/xtensa/ieee754-df.S new file mode 100644 index 00000000000..9b46889bdc2 --- /dev/null +++ b/libgcc/config/xtensa/ieee754-df.S @@ -0,0 +1,2388 @@ +/* IEEE-754 double-precision functions for Xtensa + Copyright (C) 2006, 2007, 2009 Free Software Foundation, Inc. + Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifdef __XTENSA_EB__ +#define xh a2 +#define xl a3 +#define yh a4 +#define yl a5 +#else +#define xh a3 +#define xl a2 +#define yh a5 +#define yl a4 +#endif + +/* Warning! The branch displacements for some Xtensa branch instructions + are quite small, and this code has been carefully laid out to keep + branch targets in range. 
If you change anything, be sure to check that + the assembler is not relaxing anything to branch over a jump. */ + +#ifdef L_negdf2 + + .align 4 + .global __negdf2 + .type __negdf2, @function +__negdf2: + leaf_entry sp, 16 + movi a4, 0x80000000 + xor xh, xh, a4 + leaf_return + +#endif /* L_negdf2 */ + +#ifdef L_addsubdf3 + + /* Addition */ +__adddf3_aux: + + /* Handle NaNs and Infinities. (This code is placed before the + start of the function just to keep it in range of the limited + branch displacements.) */ + +.Ladd_xnan_or_inf: + /* If y is neither Infinity nor NaN, return x. */ + bnall yh, a6, 1f + /* If x is a NaN, return it. Otherwise, return y. */ + slli a7, xh, 12 + or a7, a7, xl + beqz a7, .Ladd_ynan_or_inf +1: leaf_return + +.Ladd_ynan_or_inf: + /* Return y. */ + mov xh, yh + mov xl, yl + leaf_return + +.Ladd_opposite_signs: + /* Operand signs differ. Do a subtraction. */ + slli a7, a6, 11 + xor yh, yh, a7 + j .Lsub_same_sign + + .align 4 + .global __adddf3 + .type __adddf3, @function +__adddf3: + leaf_entry sp, 16 + movi a6, 0x7ff00000 + + /* Check if the two operands have the same sign. */ + xor a7, xh, yh + bltz a7, .Ladd_opposite_signs + +.Ladd_same_sign: + /* Check if either exponent == 0x7ff (i.e., NaN or Infinity). */ + ball xh, a6, .Ladd_xnan_or_inf + ball yh, a6, .Ladd_ynan_or_inf + + /* Compare the exponents. The smaller operand will be shifted + right by the exponent difference and added to the larger + one. */ + extui a7, xh, 20, 12 + extui a8, yh, 20, 12 + bltu a7, a8, .Ladd_shiftx + +.Ladd_shifty: + /* Check if the smaller (or equal) exponent is zero. */ + bnone yh, a6, .Ladd_yexpzero + + /* Replace yh sign/exponent with 0x001. */ + or yh, yh, a6 + slli yh, yh, 11 + srli yh, yh, 11 + +.Ladd_yexpdiff: + /* Compute the exponent difference. Optimize for difference < 32. */ + sub a10, a7, a8 + bgeui a10, 32, .Ladd_bigshifty + + /* Shift yh/yl right by the exponent difference. 
Any bits that are + shifted out of yl are saved in a9 for rounding the result. */ + ssr a10 + movi a9, 0 + src a9, yl, a9 + src yl, yh, yl + srl yh, yh + +.Ladd_addy: + /* Do the 64-bit addition. */ + add xl, xl, yl + add xh, xh, yh + bgeu xl, yl, 1f + addi xh, xh, 1 +1: + /* Check if the add overflowed into the exponent. */ + extui a10, xh, 20, 12 + beq a10, a7, .Ladd_round + mov a8, a7 + j .Ladd_carry + +.Ladd_yexpzero: + /* y is a subnormal value. Replace its sign/exponent with zero, + i.e., no implicit "1.0", and increment the apparent exponent + because subnormals behave as if they had the minimum (nonzero) + exponent. Test for the case when both exponents are zero. */ + slli yh, yh, 12 + srli yh, yh, 12 + bnone xh, a6, .Ladd_bothexpzero + addi a8, a8, 1 + j .Ladd_yexpdiff + +.Ladd_bothexpzero: + /* Both exponents are zero. Handle this as a special case. There + is no need to shift or round, and the normal code for handling + a carry into the exponent field will not work because it + assumes there is an implicit "1.0" that needs to be added. */ + add xl, xl, yl + add xh, xh, yh + bgeu xl, yl, 1f + addi xh, xh, 1 +1: leaf_return + +.Ladd_bigshifty: + /* Exponent difference > 64 -- just return the bigger value. */ + bgeui a10, 64, 1b + + /* Shift yh/yl right by the exponent difference. Any bits that are + shifted out are saved in a9 for rounding the result. */ + ssr a10 + sll a11, yl /* lost bits shifted out of yl */ + src a9, yh, yl + srl yl, yh + movi yh, 0 + beqz a11, .Ladd_addy + or a9, a9, a10 /* any positive, nonzero value will work */ + j .Ladd_addy + +.Ladd_xexpzero: + /* Same as "yexpzero" except skip handling the case when both + exponents are zero. */ + slli xh, xh, 12 + srli xh, xh, 12 + addi a7, a7, 1 + j .Ladd_xexpdiff + +.Ladd_shiftx: + /* Same thing as the "shifty" code, but with x and y swapped. Also, + because the exponent difference is always nonzero in this version, + the shift sequence can use SLL and skip loading a constant zero. 
*/ + bnone xh, a6, .Ladd_xexpzero + + or xh, xh, a6 + slli xh, xh, 11 + srli xh, xh, 11 + +.Ladd_xexpdiff: + sub a10, a8, a7 + bgeui a10, 32, .Ladd_bigshiftx + + ssr a10 + sll a9, xl + src xl, xh, xl + srl xh, xh + +.Ladd_addx: + add xl, xl, yl + add xh, xh, yh + bgeu xl, yl, 1f + addi xh, xh, 1 +1: + /* Check if the add overflowed into the exponent. */ + extui a10, xh, 20, 12 + bne a10, a8, .Ladd_carry + +.Ladd_round: + /* Round up if the leftover fraction is >= 1/2. */ + bgez a9, 1f + addi xl, xl, 1 + beqz xl, .Ladd_roundcarry + + /* Check if the leftover fraction is exactly 1/2. */ + slli a9, a9, 1 + beqz a9, .Ladd_exactlyhalf +1: leaf_return + +.Ladd_bigshiftx: + /* Mostly the same thing as "bigshifty".... */ + bgeui a10, 64, .Ladd_returny + + ssr a10 + sll a11, xl + src a9, xh, xl + srl xl, xh + movi xh, 0 + beqz a11, .Ladd_addx + or a9, a9, a10 + j .Ladd_addx + +.Ladd_returny: + mov xh, yh + mov xl, yl + leaf_return + +.Ladd_carry: + /* The addition has overflowed into the exponent field, so the + value needs to be renormalized. The mantissa of the result + can be recovered by subtracting the original exponent and + adding 0x100000 (which is the explicit "1.0" for the + mantissa of the non-shifted operand -- the "1.0" for the + shifted operand was already added). The mantissa can then + be shifted right by one bit. The explicit "1.0" of the + shifted mantissa then needs to be replaced by the exponent, + incremented by one to account for the normalizing shift. + It is faster to combine these operations: do the shift first + and combine the additions and subtractions. If x is the + original exponent, the result is: + shifted mantissa - (x << 19) + (1 << 19) + (x << 20) + or: + shifted mantissa + ((x + 1) << 19) + Note that the exponent is incremented here by leaving the + explicit "1.0" of the mantissa in the exponent field. */ + + /* Shift xh/xl right by one bit. Save the lsb of xl. 
*/ + mov a10, xl + ssai 1 + src xl, xh, xl + srl xh, xh + + /* See explanation above. The original exponent is in a8. */ + addi a8, a8, 1 + slli a8, a8, 19 + add xh, xh, a8 + + /* Return an Infinity if the exponent overflowed. */ + ball xh, a6, .Ladd_infinity + + /* Same thing as the "round" code except the msb of the leftover + fraction is bit 0 of a10, with the rest of the fraction in a9. */ + bbci.l a10, 0, 1f + addi xl, xl, 1 + beqz xl, .Ladd_roundcarry + beqz a9, .Ladd_exactlyhalf +1: leaf_return + +.Ladd_infinity: + /* Clear the mantissa. */ + movi xl, 0 + srli xh, xh, 20 + slli xh, xh, 20 + + /* The sign bit may have been lost in a carry-out. Put it back. */ + slli a8, a8, 1 + or xh, xh, a8 + leaf_return + +.Ladd_exactlyhalf: + /* Round down to the nearest even value. */ + srli xl, xl, 1 + slli xl, xl, 1 + leaf_return + +.Ladd_roundcarry: + /* xl is always zero when the rounding increment overflows, so + there's no need to round it to an even value. */ + addi xh, xh, 1 + /* Overflow to the exponent is OK. */ + leaf_return + + + /* Subtraction */ +__subdf3_aux: + + /* Handle NaNs and Infinities. (This code is placed before the + start of the function just to keep it in range of the limited + branch displacements.) Any NaN that is returned is made quiet + first, so that a signaling NaN operand is never passed through + to the result unchanged. */ + +.Lsub_xnan_or_inf: + /* If y is neither Infinity nor NaN, return x. */ + bnall yh, a6, .Lsub_return_nan_or_inf + + /* Both x and y are either NaN or Inf, so the result is NaN. + (This exit is also used to quiet a NaN that is being returned.) */ +.Lsub_return_nan: + movi a4, 0x80000 /* make it a quiet NaN */ + or xh, xh, a4 + leaf_return + +.Lsub_ynan_or_inf: + /* Negate y and return it. */ + slli a7, a6, 11 + xor xh, yh, a7 + mov xl, yl + +.Lsub_return_nan_or_inf: + /* The value in xh/xl has exponent 0x7ff. Return an Infinity as + is; a NaN (nonzero mantissa) must be quieted first. */ + slli a7, xh, 12 + or a7, a7, xl + bnez a7, .Lsub_return_nan + leaf_return + +.Lsub_opposite_signs: + /* Operand signs differ. Do an addition. */ + slli a7, a6, 11 + xor yh, yh, a7 + j .Ladd_same_sign + + .align 4 + .global __subdf3 + .type __subdf3, @function +__subdf3: + leaf_entry sp, 16 + movi a6, 0x7ff00000 + + /* Check if the two operands have the same sign. 
*/ + xor a7, xh, yh + bltz a7, .Lsub_opposite_signs + +.Lsub_same_sign: + /* Check if either exponent == 0x7ff (i.e., NaN or Infinity). */ + ball xh, a6, .Lsub_xnan_or_inf + ball yh, a6, .Lsub_ynan_or_inf + + /* Compare the operands. In contrast to addition, the entire + value matters here. */ + extui a7, xh, 20, 11 + extui a8, yh, 20, 11 + bltu xh, yh, .Lsub_xsmaller + beq xh, yh, .Lsub_compare_low + +.Lsub_ysmaller: + /* Check if the smaller (or equal) exponent is zero. */ + bnone yh, a6, .Lsub_yexpzero + + /* Replace yh sign/exponent with 0x001. */ + or yh, yh, a6 + slli yh, yh, 11 + srli yh, yh, 11 + +.Lsub_yexpdiff: + /* Compute the exponent difference. Optimize for difference < 32. */ + sub a10, a7, a8 + bgeui a10, 32, .Lsub_bigshifty + + /* Shift yh/yl right by the exponent difference. Any bits that are + shifted out of yl are saved in a9 for rounding the result. */ + ssr a10 + movi a9, 0 + src a9, yl, a9 + src yl, yh, yl + srl yh, yh + +.Lsub_suby: + /* Do the 64-bit subtraction. */ + sub xh, xh, yh + bgeu xl, yl, 1f + addi xh, xh, -1 +1: sub xl, xl, yl + + /* Subtract the leftover bits in a9 from zero and propagate any + borrow from xh/xl. */ + neg a9, a9 + beqz a9, 1f + addi a5, xh, -1 + moveqz xh, a5, xl + addi xl, xl, -1 +1: + /* Check if the subtract underflowed into the exponent. */ + extui a10, xh, 20, 11 + beq a10, a7, .Lsub_round + j .Lsub_borrow + +.Lsub_compare_low: + /* The high words are equal. Compare the low words. */ + bltu xl, yl, .Lsub_xsmaller + bltu yl, xl, .Lsub_ysmaller + /* The operands are equal. Return 0.0. */ + movi xh, 0 + movi xl, 0 +1: leaf_return + +.Lsub_yexpzero: + /* y is a subnormal value. Replace its sign/exponent with zero, + i.e., no implicit "1.0". Unless x is also a subnormal, increment + y's apparent exponent because subnormals behave as if they had + the minimum (nonzero) exponent. 
*/ + slli yh, yh, 12 + srli yh, yh, 12 + bnone xh, a6, .Lsub_yexpdiff + addi a8, a8, 1 + j .Lsub_yexpdiff + +.Lsub_bigshifty: + /* Exponent difference > 64 -- just return the bigger value. */ + bgeui a10, 64, 1b + + /* Shift yh/yl right by the exponent difference. Any bits that are + shifted out are saved in a9 for rounding the result. */ + ssr a10 + sll a11, yl /* lost bits shifted out of yl */ + src a9, yh, yl + srl yl, yh + movi yh, 0 + beqz a11, .Lsub_suby + or a9, a9, a10 /* any positive, nonzero value will work */ + j .Lsub_suby + +.Lsub_xsmaller: + /* Same thing as the "ysmaller" code, but with x and y swapped and + with y negated. */ + bnone xh, a6, .Lsub_xexpzero + + or xh, xh, a6 + slli xh, xh, 11 + srli xh, xh, 11 + +.Lsub_xexpdiff: + sub a10, a8, a7 + bgeui a10, 32, .Lsub_bigshiftx + + ssr a10 + movi a9, 0 + src a9, xl, a9 + src xl, xh, xl + srl xh, xh + + /* Negate y. */ + slli a11, a6, 11 + xor yh, yh, a11 + +.Lsub_subx: + sub xl, yl, xl + sub xh, yh, xh + bgeu yl, xl, 1f + addi xh, xh, -1 +1: + /* Subtract the leftover bits in a9 from zero and propagate any + borrow from xh/xl. */ + neg a9, a9 + beqz a9, 1f + addi a5, xh, -1 + moveqz xh, a5, xl + addi xl, xl, -1 +1: + /* Check if the subtract underflowed into the exponent. */ + extui a10, xh, 20, 11 + bne a10, a8, .Lsub_borrow + +.Lsub_round: + /* Round up if the leftover fraction is >= 1/2. */ + bgez a9, 1f + addi xl, xl, 1 + beqz xl, .Lsub_roundcarry + + /* Check if the leftover fraction is exactly 1/2. */ + slli a9, a9, 1 + beqz a9, .Lsub_exactlyhalf +1: leaf_return + +.Lsub_xexpzero: + /* Same as "yexpzero". */ + slli xh, xh, 12 + srli xh, xh, 12 + bnone yh, a6, .Lsub_xexpdiff + addi a7, a7, 1 + j .Lsub_xexpdiff + +.Lsub_bigshiftx: + /* Mostly the same thing as "bigshifty", but with the sign bit of the + shifted value set so that the subsequent subtraction flips the + sign of y. 
*/ + bgeui a10, 64, .Lsub_returny + + ssr a10 + sll a11, xl + src a9, xh, xl + srl xl, xh + slli xh, a6, 11 /* set sign bit of xh */ + beqz a11, .Lsub_subx + or a9, a9, a10 + j .Lsub_subx + +.Lsub_returny: + /* Negate and return y. */ + slli a7, a6, 11 + xor xh, yh, a7 + mov xl, yl + leaf_return + +.Lsub_borrow: + /* The subtraction has underflowed into the exponent field, so the + value needs to be renormalized. Shift the mantissa left as + needed to remove any leading zeros and adjust the exponent + accordingly. If the exponent is not large enough to remove + all the leading zeros, the result will be a subnormal value. */ + + slli a8, xh, 12 + beqz a8, .Lsub_xhzero + do_nsau a6, a8, a7, a11 + srli a8, a8, 12 + bge a6, a10, .Lsub_subnormal + addi a6, a6, 1 + +.Lsub_shift_lt32: + /* Shift the mantissa (a8/xl/a9) left by a6. */ + ssl a6 + src a8, a8, xl + src xl, xl, a9 + sll a9, a9 + + /* Combine the shifted mantissa with the sign and exponent, + decrementing the exponent by a6. (The exponent has already + been decremented by one due to the borrow from the subtraction, + but adding the mantissa will increment the exponent by one.) */ + srli xh, xh, 20 + sub xh, xh, a6 + slli xh, xh, 20 + add xh, xh, a8 + j .Lsub_round + +.Lsub_exactlyhalf: + /* Round down to the nearest even value. */ + srli xl, xl, 1 + slli xl, xl, 1 + leaf_return + +.Lsub_roundcarry: + /* xl is always zero when the rounding increment overflows, so + there's no need to round it to an even value. */ + addi xh, xh, 1 + /* Overflow to the exponent is OK. */ + leaf_return + +.Lsub_xhzero: + /* When normalizing the result, all the mantissa bits in the high + word are zero. Shift by "20 + (leading zero count of xl) + 1". 
*/ + do_nsau a6, xl, a7, a11 + addi a6, a6, 21 + blt a10, a6, .Lsub_subnormal + +.Lsub_normalize_shift: + bltui a6, 32, .Lsub_shift_lt32 + + ssl a6 + src a8, xl, a9 + sll xl, a9 + movi a9, 0 + + srli xh, xh, 20 + sub xh, xh, a6 + slli xh, xh, 20 + add xh, xh, a8 + j .Lsub_round + +.Lsub_subnormal: + /* The exponent is too small to shift away all the leading zeros. + Set a6 to the current exponent (which has already been + decremented by the borrow) so that the exponent of the result + will be zero. Do not add 1 to a6 in this case, because: (1) + adding the mantissa will not increment the exponent, so there is + no need to subtract anything extra from the exponent to + compensate, and (2) the effective exponent of a subnormal is 1 + not 0 so the shift amount must be 1 smaller than normal. */ + mov a6, a10 + j .Lsub_normalize_shift + +#endif /* L_addsubdf3 */ + +#ifdef L_muldf3 + + /* Multiplication */ +#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16 +#define XCHAL_NO_MUL 1 +#endif + +__muldf3_aux: + + /* Handle unusual cases (zeros, subnormals, NaNs and Infinities). + (This code is placed before the start of the function just to + keep it in range of the limited branch displacements.) */ + +.Lmul_xexpzero: + /* Clear the sign bit of x. */ + slli xh, xh, 1 + srli xh, xh, 1 + + /* If x is zero, return zero. */ + or a10, xh, xl + beqz a10, .Lmul_return_zero + + /* Normalize x. Adjust the exponent in a8. */ + beqz xh, .Lmul_xh_zero + do_nsau a10, xh, a11, a12 + addi a10, a10, -11 + ssl a10 + src xh, xh, xl + sll xl, xl + movi a8, 1 + sub a8, a8, a10 + j .Lmul_xnormalized +.Lmul_xh_zero: + do_nsau a10, xl, a11, a12 + addi a10, a10, -11 + movi a8, -31 + sub a8, a8, a10 + ssl a10 + bltz a10, .Lmul_xl_srl + sll xh, xl + movi xl, 0 + j .Lmul_xnormalized +.Lmul_xl_srl: + srl xh, xl + sll xl, xl + j .Lmul_xnormalized + +.Lmul_yexpzero: + /* Clear the sign bit of y. */ + slli yh, yh, 1 + srli yh, yh, 1 + + /* If y is zero, return zero. 
*/ + or a10, yh, yl + beqz a10, .Lmul_return_zero + + /* Normalize y. Adjust the exponent in a9. */ + beqz yh, .Lmul_yh_zero + do_nsau a10, yh, a11, a12 + addi a10, a10, -11 + ssl a10 + src yh, yh, yl + sll yl, yl + movi a9, 1 + sub a9, a9, a10 + j .Lmul_ynormalized +.Lmul_yh_zero: + do_nsau a10, yl, a11, a12 + addi a10, a10, -11 + movi a9, -31 + sub a9, a9, a10 + ssl a10 + bltz a10, .Lmul_yl_srl + sll yh, yl + movi yl, 0 + j .Lmul_ynormalized +.Lmul_yl_srl: + srl yh, yl + sll yl, yl + j .Lmul_ynormalized + +.Lmul_return_zero: + /* Return zero with the appropriate sign bit. */ + srli xh, a7, 31 + slli xh, xh, 31 + movi xl, 0 + j .Lmul_done + +.Lmul_xnan_or_inf: + /* If y is zero, return NaN. */ + bnez yl, 1f + slli a8, yh, 1 + beqz a8, .Lmul_return_nan +1: + /* If y is NaN, return y. */ + bnall yh, a6, .Lmul_returnx + slli a8, yh, 12 + or a8, a8, yl + beqz a8, .Lmul_returnx + +.Lmul_returny: + mov xh, yh + mov xl, yl + +.Lmul_returnx: + /* If the value being returned is a NaN (nonzero mantissa), make + sure it is quiet; a signaling NaN operand must not be passed + through to the result unchanged. */ + slli a8, xh, 12 + or a8, a8, xl + bnez a8, .Lmul_return_nan + /* Set the sign bit and return. */ + extui a7, a7, 31, 1 + slli xh, xh, 1 + ssai 1 + src xh, a7, xh + j .Lmul_done + +.Lmul_ynan_or_inf: + /* If x is zero, return NaN. */ + bnez xl, .Lmul_returny + slli a8, xh, 1 + bnez a8, .Lmul_returny + mov xh, yh + +.Lmul_return_nan: + movi a4, 0x80000 /* make it a quiet NaN */ + or xh, xh, a4 + j .Lmul_done + + .align 4 + .global __muldf3 + .type __muldf3, @function +__muldf3: +#if __XTENSA_CALL0_ABI__ + leaf_entry sp, 32 + addi sp, sp, -32 + s32i a12, sp, 16 + s32i a13, sp, 20 + s32i a14, sp, 24 + s32i a15, sp, 28 +#elif XCHAL_NO_MUL + /* This is not really a leaf function; allocate enough stack space + to allow CALL12s to a helper function. */ + leaf_entry sp, 64 +#else + leaf_entry sp, 32 +#endif + movi a6, 0x7ff00000 + + /* Get the sign of the result. */ + xor a7, xh, yh + + /* Check for NaN and infinity. */ + ball xh, a6, .Lmul_xnan_or_inf + ball yh, a6, .Lmul_ynan_or_inf + + /* Extract the exponents. 
*/ + extui a8, xh, 20, 11 + extui a9, yh, 20, 11 + + beqz a8, .Lmul_xexpzero +.Lmul_xnormalized: + beqz a9, .Lmul_yexpzero +.Lmul_ynormalized: + + /* Add the exponents. */ + add a8, a8, a9 + + /* Replace sign/exponent fields with explicit "1.0". */ + movi a10, 0x1fffff + or xh, xh, a6 + and xh, xh, a10 + or yh, yh, a6 + and yh, yh, a10 + + /* Multiply 64x64 to 128 bits. The result ends up in xh/xl/a6. + The least-significant word of the result is thrown away except + that if it is nonzero, the lsb of a6 is set to 1. */ +#if XCHAL_HAVE_MUL32_HIGH + + /* Compute a6 with any carry-outs in a10. */ + movi a10, 0 + mull a6, xl, yh + mull a11, xh, yl + add a6, a6, a11 + bgeu a6, a11, 1f + addi a10, a10, 1 +1: + muluh a11, xl, yl + add a6, a6, a11 + bgeu a6, a11, 1f + addi a10, a10, 1 +1: + /* If the low word of the result is nonzero, set the lsb of a6. */ + mull a11, xl, yl + beqz a11, 1f + movi a9, 1 + or a6, a6, a9 +1: + /* Compute xl with any carry-outs in a9. */ + movi a9, 0 + mull a11, xh, yh + add a10, a10, a11 + bgeu a10, a11, 1f + addi a9, a9, 1 +1: + muluh a11, xh, yl + add a10, a10, a11 + bgeu a10, a11, 1f + addi a9, a9, 1 +1: + muluh xl, xl, yh + add xl, xl, a10 + bgeu xl, a10, 1f + addi a9, a9, 1 +1: + /* Compute xh. */ + muluh xh, xh, yh + add xh, xh, a9 + +#else /* ! XCHAL_HAVE_MUL32_HIGH */ + + /* Break the inputs into 16-bit chunks and compute 16 32-bit partial + products. These partial products are: + + 0 xll * yll + + 1 xll * ylh + 2 xlh * yll + + 3 xll * yhl + 4 xlh * ylh + 5 xhl * yll + + 6 xll * yhh + 7 xlh * yhl + 8 xhl * ylh + 9 xhh * yll + + 10 xlh * yhh + 11 xhl * yhl + 12 xhh * ylh + + 13 xhl * yhh + 14 xhh * yhl + + 15 xhh * yhh + + where the input chunks are (hh, hl, lh, ll). If using the Mul16 + or Mul32 multiplier options, these input chunks must be stored in + separate registers. For Mac16, the UMUL.AA.* opcodes can specify + that the inputs come from either half of the registers, so there + is no need to shift them out ahead of time. 
If there is no + multiply hardware, the 16-bit chunks can be extracted when setting + up the arguments to the separate multiply function. */ + + /* Save a7 since it is needed to hold a temporary value. */ + s32i a7, sp, 4 +#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL + /* Calling a separate multiply function will clobber a0 and requires + use of a8 as a temporary, so save those values now. (The function + uses a custom ABI so nothing else needs to be saved.) */ + s32i a0, sp, 0 + s32i a8, sp, 8 +#endif + +#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32 + +#define xlh a12 +#define ylh a13 +#define xhh a14 +#define yhh a15 + + /* Get the high halves of the inputs into registers. */ + srli xlh, xl, 16 + srli ylh, yl, 16 + srli xhh, xh, 16 + srli yhh, yh, 16 + +#define xll xl +#define yll yl +#define xhl xh +#define yhl yh + +#if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16 + /* Clear the high halves of the inputs. This does not matter + for MUL16 because the high bits are ignored. */ + extui xl, xl, 0, 16 + extui xh, xh, 0, 16 + extui yl, yl, 0, 16 + extui yh, yh, 0, 16 +#endif +#endif /* MUL16 || MUL32 */ + + +#if XCHAL_HAVE_MUL16 + +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + mul16u dst, xreg ## xhalf, yreg ## yhalf + +#elif XCHAL_HAVE_MUL32 + +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + mull dst, xreg ## xhalf, yreg ## yhalf + +#elif XCHAL_HAVE_MAC16 + +/* The preprocessor insists on inserting a space when concatenating after + a period in the definition of do_mul below. These macros are a workaround + using underscores instead of periods when doing the concatenation. 
*/ +#define umul_aa_ll umul.aa.ll +#define umul_aa_lh umul.aa.lh +#define umul_aa_hl umul.aa.hl +#define umul_aa_hh umul.aa.hh + +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + umul_aa_ ## xhalf ## yhalf xreg, yreg; \ + rsr dst, ACCLO + +#else /* no multiply hardware */ + +#define set_arg_l(dst, src) \ + extui dst, src, 0, 16 +#define set_arg_h(dst, src) \ + srli dst, src, 16 + +#if __XTENSA_CALL0_ABI__ +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + set_arg_ ## xhalf (a13, xreg); \ + set_arg_ ## yhalf (a14, yreg); \ + call0 .Lmul_mulsi3; \ + mov dst, a12 +#else +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + set_arg_ ## xhalf (a14, xreg); \ + set_arg_ ## yhalf (a15, yreg); \ + call12 .Lmul_mulsi3; \ + mov dst, a14 +#endif /* __XTENSA_CALL0_ABI__ */ + +#endif /* no multiply hardware */ + + /* Add pp1 and pp2 into a10 with carry-out in a9. */ + do_mul(a10, xl, l, yl, h) /* pp 1 */ + do_mul(a11, xl, h, yl, l) /* pp 2 */ + movi a9, 0 + add a10, a10, a11 + bgeu a10, a11, 1f + addi a9, a9, 1 +1: + /* Initialize a6 with a9/a10 shifted into position. Note that + this value can be safely incremented without any carry-outs. */ + ssai 16 + src a6, a9, a10 + + /* Compute the low word into a10. */ + do_mul(a11, xl, l, yl, l) /* pp 0 */ + sll a10, a10 + add a10, a10, a11 + bgeu a10, a11, 1f + addi a6, a6, 1 +1: + /* Compute the contributions of pp0-5 to a6, with carry-outs in a9. + This is good enough to determine the low half of a6, so that any + nonzero bits from the low word of the result can be collapsed + into a6, freeing up a register. */ + movi a9, 0 + do_mul(a11, xl, l, yh, l) /* pp 3 */ + add a6, a6, a11 + bgeu a6, a11, 1f + addi a9, a9, 1 +1: + do_mul(a11, xl, h, yl, h) /* pp 4 */ + add a6, a6, a11 + bgeu a6, a11, 1f + addi a9, a9, 1 +1: + do_mul(a11, xh, l, yl, l) /* pp 5 */ + add a6, a6, a11 + bgeu a6, a11, 1f + addi a9, a9, 1 +1: + /* Collapse any nonzero bits from the low word into a6. 
*/ + beqz a10, 1f + movi a11, 1 + or a6, a6, a11 +1: + /* Add pp6-9 into a11 with carry-outs in a10. */ + do_mul(a7, xl, l, yh, h) /* pp 6 */ + do_mul(a11, xh, h, yl, l) /* pp 9 */ + movi a10, 0 + add a11, a11, a7 + bgeu a11, a7, 1f + addi a10, a10, 1 +1: + do_mul(a7, xl, h, yh, l) /* pp 7 */ + add a11, a11, a7 + bgeu a11, a7, 1f + addi a10, a10, 1 +1: + do_mul(a7, xh, l, yl, h) /* pp 8 */ + add a11, a11, a7 + bgeu a11, a7, 1f + addi a10, a10, 1 +1: + /* Shift a10/a11 into position, and add low half of a11 to a6. */ + src a10, a10, a11 + add a10, a10, a9 + sll a11, a11 + add a6, a6, a11 + bgeu a6, a11, 1f + addi a10, a10, 1 +1: + /* Add pp10-12 into xl with carry-outs in a9. */ + movi a9, 0 + do_mul(xl, xl, h, yh, h) /* pp 10 */ + add xl, xl, a10 + bgeu xl, a10, 1f + addi a9, a9, 1 +1: + do_mul(a10, xh, l, yh, l) /* pp 11 */ + add xl, xl, a10 + bgeu xl, a10, 1f + addi a9, a9, 1 +1: + do_mul(a10, xh, h, yl, h) /* pp 12 */ + add xl, xl, a10 + bgeu xl, a10, 1f + addi a9, a9, 1 +1: + /* Add pp13-14 into a11 with carry-outs in a10. */ + do_mul(a11, xh, l, yh, h) /* pp 13 */ + do_mul(a7, xh, h, yh, l) /* pp 14 */ + movi a10, 0 + add a11, a11, a7 + bgeu a11, a7, 1f + addi a10, a10, 1 +1: + /* Shift a10/a11 into position, and add low half of a11 to xl. */ + src a10, a10, a11 + add a10, a10, a9 + sll a11, a11 + add xl, xl, a11 + bgeu xl, a11, 1f + addi a10, a10, 1 +1: + /* Compute xh. */ + do_mul(xh, xh, h, yh, h) /* pp 15 */ + add xh, xh, a10 + + /* Restore values saved on the stack during the multiplication. */ + l32i a7, sp, 4 +#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL + l32i a0, sp, 0 + l32i a8, sp, 8 +#endif +#endif /* ! XCHAL_HAVE_MUL32_HIGH */ + + /* Shift left by 12 bits, unless there was a carry-out from the + multiply, in which case, shift by 11 bits and increment the + exponent. Note: It is convenient to use the constant 0x3ff + instead of 0x400 when removing the extra exponent bias (so that + it is easy to construct 0x7fe for the overflow check). 
Reverse + the logic here to decrement the exponent sum by one unless there + was a carry-out. */ + movi a4, 11 + srli a5, xh, 21 - 12 + bnez a5, 1f + addi a4, a4, 1 + addi a8, a8, -1 +1: ssl a4 + src xh, xh, xl + src xl, xl, a6 + sll a6, a6 + + /* Subtract the extra bias from the exponent sum (plus one to account + for the explicit "1.0" of the mantissa that will be added to the + exponent in the final result). */ + movi a4, 0x3ff + sub a8, a8, a4 + + /* Check for over/underflow. The value in a8 is one less than the + final exponent, so values in the range 0..7fd are OK here. */ + slli a4, a4, 1 /* 0x7fe */ + bgeu a8, a4, .Lmul_overflow + +.Lmul_round: + /* Round. */ + bgez a6, .Lmul_rounded + addi xl, xl, 1 + beqz xl, .Lmul_roundcarry + slli a6, a6, 1 + beqz a6, .Lmul_exactlyhalf + +.Lmul_rounded: + /* Add the exponent to the mantissa. */ + slli a8, a8, 20 + add xh, xh, a8 + +.Lmul_addsign: + /* Add the sign bit. */ + srli a7, a7, 31 + slli a7, a7, 31 + or xh, xh, a7 + +.Lmul_done: +#if __XTENSA_CALL0_ABI__ + l32i a12, sp, 16 + l32i a13, sp, 20 + l32i a14, sp, 24 + l32i a15, sp, 28 + addi sp, sp, 32 +#endif + leaf_return + +.Lmul_exactlyhalf: + /* Round down to the nearest even value. */ + srli xl, xl, 1 + slli xl, xl, 1 + j .Lmul_rounded + +.Lmul_roundcarry: + /* xl is always zero when the rounding increment overflows, so + there's no need to round it to an even value. */ + addi xh, xh, 1 + /* Overflow is OK -- it will be added to the exponent. */ + j .Lmul_rounded + +.Lmul_overflow: + bltz a8, .Lmul_underflow + /* Return +/- Infinity. */ + addi a8, a4, 1 /* 0x7ff */ + slli xh, a8, 20 + movi xl, 0 + j .Lmul_addsign + +.Lmul_underflow: + /* Create a subnormal value, where the exponent field contains zero, + but the effective exponent is 1. The value of a8 is one less than + the actual exponent, so just negate it to get the shift amount. */ + neg a8, a8 + mov a9, a6 + ssr a8 + bgeui a8, 32, .Lmul_bigshift + + /* Shift xh/xl right. 
Any bits that are shifted out of xl are saved + in a6 (combined with the shifted-out bits currently in a6) for + rounding the result. */ + sll a6, xl + src xl, xh, xl + srl xh, xh + j 1f + +.Lmul_bigshift: + bgeui a8, 64, .Lmul_flush_to_zero + sll a10, xl /* lost bits shifted out of xl */ + src a6, xh, xl + srl xl, xh + movi xh, 0 + or a9, a9, a10 + + /* Set the exponent to zero. */ +1: movi a8, 0 + + /* Pack any nonzero bits shifted out into a6. */ + beqz a9, .Lmul_round + movi a9, 1 + or a6, a6, a9 + j .Lmul_round + +.Lmul_flush_to_zero: + /* Return zero with the appropriate sign bit. */ + srli xh, a7, 31 + slli xh, xh, 31 + movi xl, 0 + j .Lmul_done + +#if XCHAL_NO_MUL + + /* For Xtensa processors with no multiply hardware, this simplified + version of _mulsi3 is used for multiplying 16-bit chunks of + the floating-point mantissas. When using CALL0, this function + uses a custom ABI: the inputs are passed in a13 and a14, the + result is returned in a12, and a8 and a15 are clobbered. */ + .align 4 +.Lmul_mulsi3: + leaf_entry sp, 16 + .macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2 + movi \dst, 0 +1: add \tmp1, \src2, \dst + extui \tmp2, \src1, 0, 1 + movnez \dst, \tmp1, \tmp2 + + do_addx2 \tmp1, \src2, \dst, \tmp1 + extui \tmp2, \src1, 1, 1 + movnez \dst, \tmp1, \tmp2 + + do_addx4 \tmp1, \src2, \dst, \tmp1 + extui \tmp2, \src1, 2, 1 + movnez \dst, \tmp1, \tmp2 + + do_addx8 \tmp1, \src2, \dst, \tmp1 + extui \tmp2, \src1, 3, 1 + movnez \dst, \tmp1, \tmp2 + + srli \src1, \src1, 4 + slli \src2, \src2, 4 + bnez \src1, 1b + .endm +#if __XTENSA_CALL0_ABI__ + mul_mulsi3_body a12, a13, a14, a15, a8 +#else + /* The result will be written into a2, so save that argument in a4. */ + mov a4, a2 + mul_mulsi3_body a2, a4, a3, a5, a6 +#endif + leaf_return +#endif /* XCHAL_NO_MUL */ +#endif /* L_muldf3 */ + +#ifdef L_divdf3 + + /* Division */ +__divdf3_aux: + + /* Handle unusual cases (zeros, subnormals, NaNs and Infinities). 
+ (This code is placed before the start of the function just to + keep it in range of the limited branch displacements.) */ + +.Ldiv_yexpzero: + /* Clear the sign bit of y. */ + slli yh, yh, 1 + srli yh, yh, 1 + + /* Check for division by zero. */ + or a10, yh, yl + beqz a10, .Ldiv_yzero + + /* Normalize y. Adjust the exponent in a9. */ + beqz yh, .Ldiv_yh_zero + do_nsau a10, yh, a11, a9 + addi a10, a10, -11 + ssl a10 + src yh, yh, yl + sll yl, yl + movi a9, 1 + sub a9, a9, a10 + j .Ldiv_ynormalized +.Ldiv_yh_zero: + do_nsau a10, yl, a11, a9 + addi a10, a10, -11 + movi a9, -31 + sub a9, a9, a10 + ssl a10 + bltz a10, .Ldiv_yl_srl + sll yh, yl + movi yl, 0 + j .Ldiv_ynormalized +.Ldiv_yl_srl: + srl yh, yl + sll yl, yl + j .Ldiv_ynormalized + +.Ldiv_yzero: + /* y is zero. Return NaN if x is also zero; otherwise, infinity. */ + slli xh, xh, 1 + srli xh, xh, 1 + or xl, xl, xh + srli xh, a7, 31 + slli xh, xh, 31 + or xh, xh, a6 + bnez xl, 1f + movi a4, 0x80000 /* make it a quiet NaN */ + or xh, xh, a4 +1: movi xl, 0 + leaf_return + +.Ldiv_xexpzero: + /* Clear the sign bit of x. */ + slli xh, xh, 1 + srli xh, xh, 1 + + /* If x is zero, return zero. */ + or a10, xh, xl + beqz a10, .Ldiv_return_zero + + /* Normalize x. Adjust the exponent in a8. */ + beqz xh, .Ldiv_xh_zero + do_nsau a10, xh, a11, a8 + addi a10, a10, -11 + ssl a10 + src xh, xh, xl + sll xl, xl + movi a8, 1 + sub a8, a8, a10 + j .Ldiv_xnormalized +.Ldiv_xh_zero: + do_nsau a10, xl, a11, a8 + addi a10, a10, -11 + movi a8, -31 + sub a8, a8, a10 + ssl a10 + bltz a10, .Ldiv_xl_srl + sll xh, xl + movi xl, 0 + j .Ldiv_xnormalized +.Ldiv_xl_srl: + srl xh, xl + sll xl, xl + j .Ldiv_xnormalized + +.Ldiv_return_zero: + /* Return zero with the appropriate sign bit. */ + srli xh, a7, 31 + slli xh, xh, 31 + movi xl, 0 + leaf_return + +.Ldiv_xnan_or_inf: + /* Set the sign bit of the result. */ + srli a7, yh, 31 + slli a7, a7, 31 + xor xh, xh, a7 + /* If y is NaN or Inf, return NaN. 
*/ + bnall yh, a6, 1f + movi a4, 0x80000 /* make it a quiet NaN */ + or xh, xh, a4 +1: leaf_return + +.Ldiv_ynan_or_inf: + /* If y is Infinity, return zero. */ + slli a8, yh, 12 + or a8, a8, yl + beqz a8, .Ldiv_return_zero + /* y is NaN; return it. */ + mov xh, yh + mov xl, yl + leaf_return + +.Ldiv_highequal1: + bltu xl, yl, 2f + j 3f + + .align 4 + .global __divdf3 + .type __divdf3, @function +__divdf3: + leaf_entry sp, 16 + movi a6, 0x7ff00000 + + /* Get the sign of the result. */ + xor a7, xh, yh + + /* Check for NaN and infinity. */ + ball xh, a6, .Ldiv_xnan_or_inf + ball yh, a6, .Ldiv_ynan_or_inf + + /* Extract the exponents. */ + extui a8, xh, 20, 11 + extui a9, yh, 20, 11 + + beqz a9, .Ldiv_yexpzero +.Ldiv_ynormalized: + beqz a8, .Ldiv_xexpzero +.Ldiv_xnormalized: + + /* Subtract the exponents. */ + sub a8, a8, a9 + + /* Replace sign/exponent fields with explicit "1.0". */ + movi a10, 0x1fffff + or xh, xh, a6 + and xh, xh, a10 + or yh, yh, a6 + and yh, yh, a10 + + /* Set SAR for left shift by one. */ + ssai (32 - 1) + + /* The first digit of the mantissa division must be a one. + Shift x (and adjust the exponent) as needed to make this true. */ + bltu yh, xh, 3f + beq yh, xh, .Ldiv_highequal1 +2: src xh, xh, xl + sll xl, xl + addi a8, a8, -1 +3: + /* Do the first subtraction and shift. */ + sub xh, xh, yh + bgeu xl, yl, 1f + addi xh, xh, -1 +1: sub xl, xl, yl + src xh, xh, xl + sll xl, xl + + /* Put the quotient into a10/a11. */ + movi a10, 0 + movi a11, 1 + + /* Divide one bit at a time for 52 bits. */ + movi a9, 52 +#if XCHAL_HAVE_LOOPS + loop a9, .Ldiv_loopend +#endif +.Ldiv_loop: + /* Shift the quotient << 1. */ + src a10, a10, a11 + sll a11, a11 + + /* Is this digit a 0 or 1? */ + bltu xh, yh, 3f + beq xh, yh, .Ldiv_highequal2 + + /* Output a 1 and subtract. */ +2: addi a11, a11, 1 + sub xh, xh, yh + bgeu xl, yl, 1f + addi xh, xh, -1 +1: sub xl, xl, yl + + /* Shift the dividend << 1. 
*/ +3: src xh, xh, xl + sll xl, xl + +#if !XCHAL_HAVE_LOOPS + addi a9, a9, -1 + bnez a9, .Ldiv_loop +#endif +.Ldiv_loopend: + + /* Add the exponent bias (less one to account for the explicit "1.0" + of the mantissa that will be added to the exponent in the final + result). */ + movi a9, 0x3fe + add a8, a8, a9 + + /* Check for over/underflow. The value in a8 is one less than the + final exponent, so values in the range 0..7fd are OK here. */ + addmi a9, a9, 0x400 /* 0x7fe */ + bgeu a8, a9, .Ldiv_overflow + +.Ldiv_round: + /* Round. The remainder (<< 1) is in xh/xl. */ + bltu xh, yh, .Ldiv_rounded + beq xh, yh, .Ldiv_highequal3 +.Ldiv_roundup: + addi a11, a11, 1 + beqz a11, .Ldiv_roundcarry + +.Ldiv_rounded: + mov xl, a11 + /* Add the exponent to the mantissa. */ + slli a8, a8, 20 + add xh, a10, a8 + +.Ldiv_addsign: + /* Add the sign bit. */ + srli a7, a7, 31 + slli a7, a7, 31 + or xh, xh, a7 + leaf_return + +.Ldiv_highequal2: + bgeu xl, yl, 2b + j 3b + +.Ldiv_highequal3: + bltu xl, yl, .Ldiv_rounded + bne xl, yl, .Ldiv_roundup + + /* Remainder is exactly half the divisor. Round even. */ + addi a11, a11, 1 + beqz a11, .Ldiv_roundcarry + srli a11, a11, 1 + slli a11, a11, 1 + j .Ldiv_rounded + +.Ldiv_overflow: + bltz a8, .Ldiv_underflow + /* Return +/- Infinity. */ + addi a8, a9, 1 /* 0x7ff */ + slli xh, a8, 20 + movi xl, 0 + j .Ldiv_addsign + +.Ldiv_underflow: + /* Create a subnormal value, where the exponent field contains zero, + but the effective exponent is 1. The value of a8 is one less than + the actual exponent, so just negate it to get the shift amount. */ + neg a8, a8 + ssr a8 + bgeui a8, 32, .Ldiv_bigshift + + /* Shift a10/a11 right. Any bits that are shifted out of a11 are + saved in a6 for rounding the result. 
*/ + sll a6, a11 + src a11, a10, a11 + srl a10, a10 + j 1f + +.Ldiv_bigshift: + bgeui a8, 64, .Ldiv_flush_to_zero + sll a9, a11 /* lost bits shifted out of a11 */ + src a6, a10, a11 + srl a11, a10 + movi a10, 0 + or xl, xl, a9 + + /* Set the exponent to zero. */ +1: movi a8, 0 + + /* Pack any nonzero remainder (in xh/xl) into a6. */ + or xh, xh, xl + beqz xh, 1f + movi a9, 1 + or a6, a6, a9 + + /* Round a10/a11 based on the bits shifted out into a6. */ +1: bgez a6, .Ldiv_rounded + addi a11, a11, 1 + beqz a11, .Ldiv_roundcarry + slli a6, a6, 1 + bnez a6, .Ldiv_rounded + srli a11, a11, 1 + slli a11, a11, 1 + j .Ldiv_rounded + +.Ldiv_roundcarry: + /* a11 is always zero when the rounding increment overflows, so + there's no need to round it to an even value. */ + addi a10, a10, 1 + /* Overflow to the exponent field is OK. */ + j .Ldiv_rounded + +.Ldiv_flush_to_zero: + /* Return zero with the appropriate sign bit. */ + srli xh, a7, 31 + slli xh, xh, 31 + movi xl, 0 + leaf_return + +#endif /* L_divdf3 */ + +#ifdef L_cmpdf2 + + /* Equal and Not Equal */ + + .align 4 + .global __eqdf2 + .global __nedf2 + .set __nedf2, __eqdf2 + .type __eqdf2, @function +__eqdf2: + leaf_entry sp, 16 + bne xl, yl, 2f + bne xh, yh, 4f + + /* The values are equal but NaN != NaN. Check the exponent. */ + movi a6, 0x7ff00000 + ball xh, a6, 3f + + /* Equal. */ + movi a2, 0 + leaf_return + + /* Not equal. */ +2: movi a2, 1 + leaf_return + + /* Check if the mantissas are nonzero. */ +3: slli a7, xh, 12 + or a7, a7, xl + j 5f + + /* Check if x and y are zero with different signs. */ +4: or a7, xh, yh + slli a7, a7, 1 + or a7, a7, xl /* xl == yl here */ + + /* Equal if a7 == 0, where a7 is either abs(x | y) or the mantissa + of x when exponent(x) = 0x7ff and x == y. 
*/ +5: movi a2, 0 + movi a3, 1 + movnez a2, a3, a7 + leaf_return + + + /* Greater Than */ + + .align 4 + .global __gtdf2 + .type __gtdf2, @function +__gtdf2: + leaf_entry sp, 16 + movi a6, 0x7ff00000 + ball xh, a6, 2f +1: bnall yh, a6, .Lle_cmp + + /* Check if y is a NaN. */ + slli a7, yh, 12 + or a7, a7, yl + beqz a7, .Lle_cmp + movi a2, 0 + leaf_return + + /* Check if x is a NaN. */ +2: slli a7, xh, 12 + or a7, a7, xl + beqz a7, 1b + movi a2, 0 + leaf_return + + + /* Less Than or Equal */ + + .align 4 + .global __ledf2 + .type __ledf2, @function +__ledf2: + leaf_entry sp, 16 + movi a6, 0x7ff00000 + ball xh, a6, 2f +1: bnall yh, a6, .Lle_cmp + + /* Check if y is a NaN. */ + slli a7, yh, 12 + or a7, a7, yl + beqz a7, .Lle_cmp + movi a2, 1 + leaf_return + + /* Check if x is a NaN. */ +2: slli a7, xh, 12 + or a7, a7, xl + beqz a7, 1b + movi a2, 1 + leaf_return + +.Lle_cmp: + /* Check if x and y have different signs. */ + xor a7, xh, yh + bltz a7, .Lle_diff_signs + + /* Check if x is negative. */ + bltz xh, .Lle_xneg + + /* Check if x <= y. */ + bltu xh, yh, 4f + bne xh, yh, 5f + bltu yl, xl, 5f +4: movi a2, 0 + leaf_return + +.Lle_xneg: + /* Check if y <= x. */ + bltu yh, xh, 4b + bne yh, xh, 5f + bgeu xl, yl, 4b +5: movi a2, 1 + leaf_return + +.Lle_diff_signs: + bltz xh, 4b + + /* Check if both x and y are zero. */ + or a7, xh, yh + slli a7, a7, 1 + or a7, a7, xl + or a7, a7, yl + movi a2, 1 + movi a3, 0 + moveqz a2, a3, a7 + leaf_return + + + /* Greater Than or Equal */ + + .align 4 + .global __gedf2 + .type __gedf2, @function +__gedf2: + leaf_entry sp, 16 + movi a6, 0x7ff00000 + ball xh, a6, 2f +1: bnall yh, a6, .Llt_cmp + + /* Check if y is a NaN. */ + slli a7, yh, 12 + or a7, a7, yl + beqz a7, .Llt_cmp + movi a2, -1 + leaf_return + + /* Check if x is a NaN. 
*/ +2: slli a7, xh, 12 + or a7, a7, xl + beqz a7, 1b + movi a2, -1 + leaf_return + + + /* Less Than */ + + .align 4 + .global __ltdf2 + .type __ltdf2, @function +__ltdf2: + leaf_entry sp, 16 + movi a6, 0x7ff00000 + ball xh, a6, 2f +1: bnall yh, a6, .Llt_cmp + + /* Check if y is a NaN. */ + slli a7, yh, 12 + or a7, a7, yl + beqz a7, .Llt_cmp + movi a2, 0 + leaf_return + + /* Check if x is a NaN. */ +2: slli a7, xh, 12 + or a7, a7, xl + beqz a7, 1b + movi a2, 0 + leaf_return + +.Llt_cmp: + /* Check if x and y have different signs. */ + xor a7, xh, yh + bltz a7, .Llt_diff_signs + + /* Check if x is negative. */ + bltz xh, .Llt_xneg + + /* Check if x < y. */ + bltu xh, yh, 4f + bne xh, yh, 5f + bgeu xl, yl, 5f +4: movi a2, -1 + leaf_return + +.Llt_xneg: + /* Check if y < x. */ + bltu yh, xh, 4b + bne yh, xh, 5f + bltu yl, xl, 4b +5: movi a2, 0 + leaf_return + +.Llt_diff_signs: + bgez xh, 5b + + /* x is negative and y is not, so x < y unless both x and y are zero. */ + or a7, xh, yh + slli a7, a7, 1 + or a7, a7, xl + or a7, a7, yl + movi a2, 0 + movi a3, -1 + movnez a2, a3, a7 + leaf_return + + + /* Unordered */ + + .align 4 + .global __unorddf2 + .type __unorddf2, @function +__unorddf2: + leaf_entry sp, 16 + movi a6, 0x7ff00000 + ball xh, a6, 3f +1: ball yh, a6, 4f +2: movi a2, 0 + leaf_return + +3: slli a7, xh, 12 + or a7, a7, xl + beqz a7, 1b + movi a2, 1 + leaf_return + +4: slli a7, yh, 12 + or a7, a7, yl + beqz a7, 2b + movi a2, 1 + leaf_return + +#endif /* L_cmpdf2 */ + +#ifdef L_fixdfsi + + .align 4 + .global __fixdfsi + .type __fixdfsi, @function +__fixdfsi: + leaf_entry sp, 16 + + /* Check for NaN and Infinity. */ + movi a6, 0x7ff00000 + ball xh, a6, .Lfixdfsi_nan_or_inf + + /* Extract the exponent and check if 0 < (exp - 0x3fe) < 32. */ + extui a4, xh, 20, 11 + extui a5, a6, 19, 10 /* 0x3fe */ + sub a4, a4, a5 + bgei a4, 32, .Lfixdfsi_maxint + blti a4, 1, .Lfixdfsi_zero + + /* Add explicit "1.0" and shift << 11. 
*/ + or a7, xh, a6 + ssai (32 - 11) + src a5, a7, xl + + /* Shift back to the right, based on the exponent. */ + ssl a4 /* shift by 32 - a4 */ + srl a5, a5 + + /* Negate the result if sign != 0. */ + neg a2, a5 + movgez a2, a5, a7 + leaf_return + +.Lfixdfsi_nan_or_inf: + /* Handle Infinity and NaN. */ + slli a4, xh, 12 + or a4, a4, xl + beqz a4, .Lfixdfsi_maxint + + /* Translate NaN to +maxint. */ + movi xh, 0 + +.Lfixdfsi_maxint: + slli a4, a6, 11 /* 0x80000000 */ + addi a5, a4, -1 /* 0x7fffffff */ + movgez a4, a5, xh + mov a2, a4 + leaf_return + +.Lfixdfsi_zero: + movi a2, 0 + leaf_return + +#endif /* L_fixdfsi */ + +#ifdef L_fixdfdi + + .align 4 + .global __fixdfdi + .type __fixdfdi, @function +__fixdfdi: + leaf_entry sp, 16 + + /* Check for NaN and Infinity. */ + movi a6, 0x7ff00000 + ball xh, a6, .Lfixdfdi_nan_or_inf + + /* Extract the exponent and check if 0 < (exp - 0x3fe) < 64. */ + extui a4, xh, 20, 11 + extui a5, a6, 19, 10 /* 0x3fe */ + sub a4, a4, a5 + bgei a4, 64, .Lfixdfdi_maxint + blti a4, 1, .Lfixdfdi_zero + + /* Add explicit "1.0" and shift << 11. */ + or a7, xh, a6 + ssai (32 - 11) + src xh, a7, xl + sll xl, xl + + /* Shift back to the right, based on the exponent. */ + ssl a4 /* shift by 64 - a4 */ + bgei a4, 32, .Lfixdfdi_smallshift + srl xl, xh + movi xh, 0 + +.Lfixdfdi_shifted: + /* Negate the result if sign != 0. */ + bgez a7, 1f + neg xl, xl + neg xh, xh + beqz xl, 1f + addi xh, xh, -1 +1: leaf_return + +.Lfixdfdi_smallshift: + src xl, xh, xl + srl xh, xh + j .Lfixdfdi_shifted + +.Lfixdfdi_nan_or_inf: + /* Handle Infinity and NaN. */ + slli a4, xh, 12 + or a4, a4, xl + beqz a4, .Lfixdfdi_maxint + + /* Translate NaN to +maxint. 
*/ + movi xh, 0 + +.Lfixdfdi_maxint: + slli a7, a6, 11 /* 0x80000000 */ + bgez xh, 1f + mov xh, a7 + movi xl, 0 + leaf_return + +1: addi xh, a7, -1 /* 0x7fffffff */ + movi xl, -1 + leaf_return + +.Lfixdfdi_zero: + movi xh, 0 + movi xl, 0 + leaf_return + +#endif /* L_fixdfdi */ + +#ifdef L_fixunsdfsi + + .align 4 + .global __fixunsdfsi + .type __fixunsdfsi, @function +__fixunsdfsi: + leaf_entry sp, 16 + + /* Check for NaN and Infinity. */ + movi a6, 0x7ff00000 + ball xh, a6, .Lfixunsdfsi_nan_or_inf + + /* Extract the exponent and check if 0 <= (exp - 0x3ff) < 32. */ + extui a4, xh, 20, 11 + extui a5, a6, 20, 10 /* 0x3ff */ + sub a4, a4, a5 + bgei a4, 32, .Lfixunsdfsi_maxint + bltz a4, .Lfixunsdfsi_zero + + /* Add explicit "1.0" and shift << 11. */ + or a7, xh, a6 + ssai (32 - 11) + src a5, a7, xl + + /* Shift back to the right, based on the exponent. */ + addi a4, a4, 1 + beqi a4, 32, .Lfixunsdfsi_bigexp + ssl a4 /* shift by 32 - a4 */ + srl a5, a5 + + /* Negate the result if sign != 0. */ + neg a2, a5 + movgez a2, a5, a7 + leaf_return + +.Lfixunsdfsi_nan_or_inf: + /* Handle Infinity and NaN. */ + slli a4, xh, 12 + or a4, a4, xl + beqz a4, .Lfixunsdfsi_maxint + + /* Translate NaN to 0xffffffff. */ + movi a2, -1 + leaf_return + +.Lfixunsdfsi_maxint: + slli a4, a6, 11 /* 0x80000000 */ + movi a5, -1 /* 0xffffffff */ + movgez a4, a5, xh + mov a2, a4 + leaf_return + +.Lfixunsdfsi_zero: + movi a2, 0 + leaf_return + +.Lfixunsdfsi_bigexp: + /* Handle unsigned maximum exponent case. */ + bltz xh, 1f + mov a2, a5 /* no shift needed */ + leaf_return + + /* Return 0x80000000 if negative. */ +1: slli a2, a6, 11 + leaf_return + +#endif /* L_fixunsdfsi */ + +#ifdef L_fixunsdfdi + + .align 4 + .global __fixunsdfdi + .type __fixunsdfdi, @function +__fixunsdfdi: + leaf_entry sp, 16 + + /* Check for NaN and Infinity. */ + movi a6, 0x7ff00000 + ball xh, a6, .Lfixunsdfdi_nan_or_inf + + /* Extract the exponent and check if 0 <= (exp - 0x3ff) < 64. 
*/ + extui a4, xh, 20, 11 + extui a5, a6, 20, 10 /* 0x3ff */ + sub a4, a4, a5 + bgei a4, 64, .Lfixunsdfdi_maxint + bltz a4, .Lfixunsdfdi_zero + + /* Add explicit "1.0" and shift << 11. */ + or a7, xh, a6 + ssai (32 - 11) + src xh, a7, xl + sll xl, xl + + /* Shift back to the right, based on the exponent. */ + addi a4, a4, 1 + beqi a4, 64, .Lfixunsdfdi_bigexp + ssl a4 /* shift by 64 - a4 */ + bgei a4, 32, .Lfixunsdfdi_smallshift + srl xl, xh + movi xh, 0 + +.Lfixunsdfdi_shifted: + /* Negate the result if sign != 0. */ + bgez a7, 1f + neg xl, xl + neg xh, xh + beqz xl, 1f + addi xh, xh, -1 +1: leaf_return + +.Lfixunsdfdi_smallshift: + src xl, xh, xl + srl xh, xh + j .Lfixunsdfdi_shifted + +.Lfixunsdfdi_nan_or_inf: + /* Handle Infinity and NaN. */ + slli a4, xh, 12 + or a4, a4, xl + beqz a4, .Lfixunsdfdi_maxint + + /* Translate NaN to 0xffffffff.... */ +1: movi xh, -1 + movi xl, -1 + leaf_return + +.Lfixunsdfdi_maxint: + bgez xh, 1b +2: slli xh, a6, 11 /* 0x80000000 */ + movi xl, 0 + leaf_return + +.Lfixunsdfdi_zero: + movi xh, 0 + movi xl, 0 + leaf_return + +.Lfixunsdfdi_bigexp: + /* Handle unsigned maximum exponent case. */ + bltz a7, 2b + leaf_return /* no shift needed */ + +#endif /* L_fixunsdfdi */ + +#ifdef L_floatsidf + + .align 4 + .global __floatunsidf + .type __floatunsidf, @function +__floatunsidf: + leaf_entry sp, 16 + beqz a2, .Lfloatsidf_return_zero + + /* Set the sign to zero and jump to the floatsidf code. */ + movi a7, 0 + j .Lfloatsidf_normalize + + .align 4 + .global __floatsidf + .type __floatsidf, @function +__floatsidf: + leaf_entry sp, 16 + + /* Check for zero. */ + beqz a2, .Lfloatsidf_return_zero + + /* Save the sign. */ + extui a7, a2, 31, 1 + + /* Get the absolute value. */ +#if XCHAL_HAVE_ABS + abs a2, a2 +#else + neg a4, a2 + movltz a2, a4, a2 +#endif + +.Lfloatsidf_normalize: + /* Normalize with the first 1 bit in the msb. */ + do_nsau a4, a2, a5, a6 + ssl a4 + sll a5, a2 + + /* Shift the mantissa into position. 
*/ + srli xh, a5, 11 + slli xl, a5, (32 - 11) + + /* Set the exponent. */ + movi a5, 0x41d /* 0x3fe + 31 */ + sub a5, a5, a4 + slli a5, a5, 20 + add xh, xh, a5 + + /* Add the sign and return. */ + slli a7, a7, 31 + or xh, xh, a7 + leaf_return + +.Lfloatsidf_return_zero: + movi a3, 0 + leaf_return + +#endif /* L_floatsidf */ + +#ifdef L_floatdidf + + .align 4 + .global __floatundidf + .type __floatundidf, @function +__floatundidf: + leaf_entry sp, 16 + + /* Check for zero. */ + or a4, xh, xl + beqz a4, 2f + + /* Set the sign to zero and jump to the floatdidf code. */ + movi a7, 0 + j .Lfloatdidf_normalize + + .align 4 + .global __floatdidf + .type __floatdidf, @function +__floatdidf: + leaf_entry sp, 16 + + /* Check for zero. */ + or a4, xh, xl + beqz a4, 2f + + /* Save the sign. */ + extui a7, xh, 31, 1 + + /* Get the absolute value. */ + bgez xh, .Lfloatdidf_normalize + neg xl, xl + neg xh, xh + beqz xl, .Lfloatdidf_normalize + addi xh, xh, -1 + +.Lfloatdidf_normalize: + /* Normalize with the first 1 bit in the msb of xh. */ + beqz xh, .Lfloatdidf_bigshift + do_nsau a4, xh, a5, a6 + ssl a4 + src xh, xh, xl + sll xl, xl + +.Lfloatdidf_shifted: + /* Shift the mantissa into position, with rounding bits in a6. */ + ssai 11 + sll a6, xl + src xl, xh, xl + srl xh, xh + + /* Set the exponent. */ + movi a5, 0x43d /* 0x3fe + 63 */ + sub a5, a5, a4 + slli a5, a5, 20 + add xh, xh, a5 + + /* Add the sign. */ + slli a7, a7, 31 + or xh, xh, a7 + + /* Round up if the leftover fraction is >= 1/2. */ + bgez a6, 2f + addi xl, xl, 1 + beqz xl, .Lfloatdidf_roundcarry + + /* Check if the leftover fraction is exactly 1/2. */ + slli a6, a6, 1 + beqz a6, .Lfloatdidf_exactlyhalf +2: leaf_return + +.Lfloatdidf_bigshift: + /* xh is zero. Normalize with first 1 bit of xl in the msb of xh. */ + do_nsau a4, xl, a5, a6 + ssl a4 + sll xh, xl + movi xl, 0 + addi a4, a4, 32 + j .Lfloatdidf_shifted + +.Lfloatdidf_exactlyhalf: + /* Round down to the nearest even value. 
*/ + srli xl, xl, 1 + slli xl, xl, 1 + leaf_return + +.Lfloatdidf_roundcarry: + /* xl is always zero when the rounding increment overflows, so + there's no need to round it to an even value. */ + addi xh, xh, 1 + /* Overflow to the exponent is OK. */ + leaf_return + +#endif /* L_floatdidf */ + +#ifdef L_truncdfsf2 + + .align 4 + .global __truncdfsf2 + .type __truncdfsf2, @function +__truncdfsf2: + leaf_entry sp, 16 + + /* Adjust the exponent bias. */ + movi a4, (0x3ff - 0x7f) << 20 + sub a5, xh, a4 + + /* Check for underflow. */ + xor a6, xh, a5 + bltz a6, .Ltrunc_underflow + extui a6, a5, 20, 11 + beqz a6, .Ltrunc_underflow + + /* Check for overflow. */ + movi a4, 255 + bge a6, a4, .Ltrunc_overflow + + /* Shift a5/xl << 3 into a5/a4. */ + ssai (32 - 3) + src a5, a5, xl + sll a4, xl + +.Ltrunc_addsign: + /* Add the sign bit. */ + extui a6, xh, 31, 1 + slli a6, a6, 31 + or a2, a6, a5 + + /* Round up if the leftover fraction is >= 1/2. */ + bgez a4, 1f + addi a2, a2, 1 + /* Overflow to the exponent is OK. The answer will be correct. */ + + /* Check if the leftover fraction is exactly 1/2. */ + slli a4, a4, 1 + beqz a4, .Ltrunc_exactlyhalf +1: leaf_return + +.Ltrunc_exactlyhalf: + /* Round down to the nearest even value. */ + srli a2, a2, 1 + slli a2, a2, 1 + leaf_return + +.Ltrunc_overflow: + /* Check if exponent == 0x7ff. */ + movi a4, 0x7ff00000 + bnall xh, a4, 1f + + /* Check if mantissa is nonzero. */ + slli a5, xh, 12 + or a5, a5, xl + beqz a5, 1f + + /* Shift a4 to set a bit in the mantissa, making a quiet NaN. */ + srli a4, a4, 1 + +1: slli a4, a4, 4 /* 0xff000000 or 0xff800000 */ + /* Add the sign bit. */ + extui a6, xh, 31, 1 + ssai 1 + src a2, a6, a4 + leaf_return + +.Ltrunc_underflow: + /* Find shift count for a subnormal. Flush to zero if >= 32. */ + extui a6, xh, 20, 11 + movi a5, 0x3ff - 0x7f + sub a6, a5, a6 + addi a6, a6, 1 + bgeui a6, 32, 1f + + /* Replace the exponent with an explicit "1.0". 
*/ + slli a5, a5, 13 /* 0x700000 */ + or a5, a5, xh + slli a5, a5, 11 + srli a5, a5, 11 + + /* Shift the mantissa left by 3 bits (into a5/a4). */ + ssai (32 - 3) + src a5, a5, xl + sll a4, xl + + /* Shift right by a6. */ + ssr a6 + sll a7, a4 + src a4, a5, a4 + srl a5, a5 + beqz a7, .Ltrunc_addsign + or a4, a4, a6 /* any positive, nonzero value will work */ + j .Ltrunc_addsign + + /* Return +/- zero. */ +1: extui a2, xh, 31, 1 + slli a2, a2, 31 + leaf_return + +#endif /* L_truncdfsf2 */ + +#ifdef L_extendsfdf2 + + .align 4 + .global __extendsfdf2 + .type __extendsfdf2, @function +__extendsfdf2: + leaf_entry sp, 16 + + /* Save the sign bit and then shift it off. */ + extui a5, a2, 31, 1 + slli a5, a5, 31 + slli a4, a2, 1 + + /* Extract and check the exponent. */ + extui a6, a2, 23, 8 + beqz a6, .Lextend_expzero + addi a6, a6, 1 + beqi a6, 256, .Lextend_nan_or_inf + + /* Shift >> 3 into a4/xl. */ + srli a4, a4, 4 + slli xl, a2, (32 - 3) + + /* Adjust the exponent bias. */ + movi a6, (0x3ff - 0x7f) << 20 + add a4, a4, a6 + + /* Add the sign bit. */ + or xh, a4, a5 + leaf_return + +.Lextend_nan_or_inf: + movi a4, 0x7ff00000 + + /* Check for NaN. */ + slli a7, a2, 9 + beqz a7, 1f + + slli a6, a6, 11 /* 0x80000 */ + or a4, a4, a6 + + /* Add the sign and return. */ +1: or xh, a4, a5 + movi xl, 0 + leaf_return + +.Lextend_expzero: + beqz a4, 1b + + /* Normalize it to have 8 zero bits before the first 1 bit. */ + do_nsau a7, a4, a2, a3 + addi a7, a7, -8 + ssl a7 + sll a4, a4 + + /* Shift >> 3 into a4/xl. */ + slli xl, a4, (32 - 3) + srli a4, a4, 3 + + /* Set the exponent. */ + movi a6, 0x3fe - 0x7f + sub a6, a6, a7 + slli a6, a6, 20 + add a4, a4, a6 + + /* Add the sign and return. 
*/ + or xh, a4, a5 + leaf_return + +#endif /* L_extendsfdf2 */ + + diff --git a/libgcc/config/xtensa/ieee754-sf.S b/libgcc/config/xtensa/ieee754-sf.S new file mode 100644 index 00000000000..d75be0e5ae5 --- /dev/null +++ b/libgcc/config/xtensa/ieee754-sf.S @@ -0,0 +1,1757 @@ +/* IEEE-754 single-precision functions for Xtensa + Copyright (C) 2006, 2007, 2009 Free Software Foundation, Inc. + Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifdef __XTENSA_EB__ +#define xh a2 +#define xl a3 +#define yh a4 +#define yl a5 +#else +#define xh a3 +#define xl a2 +#define yh a5 +#define yl a4 +#endif + +/* Warning! The branch displacements for some Xtensa branch instructions + are quite small, and this code has been carefully laid out to keep + branch targets in range. If you change anything, be sure to check that + the assembler is not relaxing anything to branch over a jump. 
*/ + +#ifdef L_negsf2 + + .align 4 + .global __negsf2 + .type __negsf2, @function +__negsf2: + leaf_entry sp, 16 + movi a4, 0x80000000 + xor a2, a2, a4 + leaf_return + +#endif /* L_negsf2 */ + +#ifdef L_addsubsf3 + + /* Addition */ +__addsf3_aux: + + /* Handle NaNs and Infinities. (This code is placed before the + start of the function just to keep it in range of the limited + branch displacements.) */ + +.Ladd_xnan_or_inf: + /* If y is neither Infinity nor NaN, return x. */ + bnall a3, a6, 1f + /* If x is a NaN, return it. Otherwise, return y. */ + slli a7, a2, 9 + beqz a7, .Ladd_ynan_or_inf +1: leaf_return + +.Ladd_ynan_or_inf: + /* Return y. */ + mov a2, a3 + leaf_return + +.Ladd_opposite_signs: + /* Operand signs differ. Do a subtraction. */ + slli a7, a6, 8 + xor a3, a3, a7 + j .Lsub_same_sign + + .align 4 + .global __addsf3 + .type __addsf3, @function +__addsf3: + leaf_entry sp, 16 + movi a6, 0x7f800000 + + /* Check if the two operands have the same sign. */ + xor a7, a2, a3 + bltz a7, .Ladd_opposite_signs + +.Ladd_same_sign: + /* Check if either exponent == 0x7f8 (i.e., NaN or Infinity). */ + ball a2, a6, .Ladd_xnan_or_inf + ball a3, a6, .Ladd_ynan_or_inf + + /* Compare the exponents. The smaller operand will be shifted + right by the exponent difference and added to the larger + one. */ + extui a7, a2, 23, 9 + extui a8, a3, 23, 9 + bltu a7, a8, .Ladd_shiftx + +.Ladd_shifty: + /* Check if the smaller (or equal) exponent is zero. */ + bnone a3, a6, .Ladd_yexpzero + + /* Replace y sign/exponent with 0x008. */ + or a3, a3, a6 + slli a3, a3, 8 + srli a3, a3, 8 + +.Ladd_yexpdiff: + /* Compute the exponent difference. */ + sub a10, a7, a8 + + /* Exponent difference >= 32 -- just return the bigger value. */ + bgeui a10, 32, 1f + + /* Shift y right by the exponent difference. Any bits that are + shifted out of y are saved in a9 for rounding the result. */ + ssr a10 + movi a9, 0 + src a9, a3, a9 + srl a3, a3 + + /* Do the addition. 
*/ + add a2, a2, a3 + + /* Check if the add overflowed into the exponent. */ + extui a10, a2, 23, 9 + beq a10, a7, .Ladd_round + mov a8, a7 + j .Ladd_carry + +.Ladd_yexpzero: + /* y is a subnormal value. Replace its sign/exponent with zero, + i.e., no implicit "1.0", and increment the apparent exponent + because subnormals behave as if they had the minimum (nonzero) + exponent. Test for the case when both exponents are zero. */ + slli a3, a3, 9 + srli a3, a3, 9 + bnone a2, a6, .Ladd_bothexpzero + addi a8, a8, 1 + j .Ladd_yexpdiff + +.Ladd_bothexpzero: + /* Both exponents are zero. Handle this as a special case. There + is no need to shift or round, and the normal code for handling + a carry into the exponent field will not work because it + assumes there is an implicit "1.0" that needs to be added. */ + add a2, a2, a3 +1: leaf_return + +.Ladd_xexpzero: + /* Same as "yexpzero" except skip handling the case when both + exponents are zero. */ + slli a2, a2, 9 + srli a2, a2, 9 + addi a7, a7, 1 + j .Ladd_xexpdiff + +.Ladd_shiftx: + /* Same thing as the "shifty" code, but with x and y swapped. Also, + because the exponent difference is always nonzero in this version, + the shift sequence can use SLL and skip loading a constant zero. */ + bnone a2, a6, .Ladd_xexpzero + + or a2, a2, a6 + slli a2, a2, 8 + srli a2, a2, 8 + +.Ladd_xexpdiff: + sub a10, a8, a7 + bgeui a10, 32, .Ladd_returny + + ssr a10 + sll a9, a2 + srl a2, a2 + + add a2, a2, a3 + + /* Check if the add overflowed into the exponent. */ + extui a10, a2, 23, 9 + bne a10, a8, .Ladd_carry + +.Ladd_round: + /* Round up if the leftover fraction is >= 1/2. */ + bgez a9, 1f + addi a2, a2, 1 + + /* Check if the leftover fraction is exactly 1/2. */ + slli a9, a9, 1 + beqz a9, .Ladd_exactlyhalf +1: leaf_return + +.Ladd_returny: + mov a2, a3 + leaf_return + +.Ladd_carry: + /* The addition has overflowed into the exponent field, so the + value needs to be renormalized. 
The mantissa of the result + can be recovered by subtracting the original exponent and + adding 0x800000 (which is the explicit "1.0" for the + mantissa of the non-shifted operand -- the "1.0" for the + shifted operand was already added). The mantissa can then + be shifted right by one bit. The explicit "1.0" of the + shifted mantissa then needs to be replaced by the exponent, + incremented by one to account for the normalizing shift. + It is faster to combine these operations: do the shift first + and combine the additions and subtractions. If x is the + original exponent, the result is: + shifted mantissa - (x << 22) + (1 << 22) + (x << 23) + or: + shifted mantissa + ((x + 1) << 22) + Note that the exponent is incremented here by leaving the + explicit "1.0" of the mantissa in the exponent field. */ + + /* Shift x right by one bit. Save the lsb. */ + mov a10, a2 + srli a2, a2, 1 + + /* See explanation above. The original exponent is in a8. */ + addi a8, a8, 1 + slli a8, a8, 22 + add a2, a2, a8 + + /* Return an Infinity if the exponent overflowed. */ + ball a2, a6, .Ladd_infinity + + /* Same thing as the "round" code except the msb of the leftover + fraction is bit 0 of a10, with the rest of the fraction in a9. */ + bbci.l a10, 0, 1f + addi a2, a2, 1 + beqz a9, .Ladd_exactlyhalf +1: leaf_return + +.Ladd_infinity: + /* Clear the mantissa. */ + srli a2, a2, 23 + slli a2, a2, 23 + + /* The sign bit may have been lost in a carry-out. Put it back. */ + slli a8, a8, 1 + or a2, a2, a8 + leaf_return + +.Ladd_exactlyhalf: + /* Round down to the nearest even value. */ + srli a2, a2, 1 + slli a2, a2, 1 + leaf_return + + + /* Subtraction */ +__subsf3_aux: + + /* Handle NaNs and Infinities. (This code is placed before the + start of the function just to keep it in range of the limited + branch displacements.) */ + +.Lsub_xnan_or_inf: + /* If y is neither Infinity nor NaN, return x. */ + bnall a3, a6, 1f + /* Both x and y are either NaN or Inf, so the result is NaN. 
*/ + movi a4, 0x400000 /* make it a quiet NaN */ + or a2, a2, a4 +1: leaf_return + +.Lsub_ynan_or_inf: + /* Negate y and return it. */ + slli a7, a6, 8 + xor a2, a3, a7 + leaf_return + +.Lsub_opposite_signs: + /* Operand signs differ. Do an addition. */ + slli a7, a6, 8 + xor a3, a3, a7 + j .Ladd_same_sign + + .align 4 + .global __subsf3 + .type __subsf3, @function +__subsf3: + leaf_entry sp, 16 + movi a6, 0x7f800000 + + /* Check if the two operands have the same sign. */ + xor a7, a2, a3 + bltz a7, .Lsub_opposite_signs + +.Lsub_same_sign: + /* Check if either exponent == 0x7f8 (i.e., NaN or Infinity). */ + ball a2, a6, .Lsub_xnan_or_inf + ball a3, a6, .Lsub_ynan_or_inf + + /* Compare the operands. In contrast to addition, the entire + value matters here. */ + extui a7, a2, 23, 8 + extui a8, a3, 23, 8 + bltu a2, a3, .Lsub_xsmaller + +.Lsub_ysmaller: + /* Check if the smaller (or equal) exponent is zero. */ + bnone a3, a6, .Lsub_yexpzero + + /* Replace y sign/exponent with 0x008. */ + or a3, a3, a6 + slli a3, a3, 8 + srli a3, a3, 8 + +.Lsub_yexpdiff: + /* Compute the exponent difference. */ + sub a10, a7, a8 + + /* Exponent difference >= 32 -- just return the bigger value. */ + bgeui a10, 32, 1f + + /* Shift y right by the exponent difference. Any bits that are + shifted out of y are saved in a9 for rounding the result. */ + ssr a10 + movi a9, 0 + src a9, a3, a9 + srl a3, a3 + + sub a2, a2, a3 + + /* Subtract the leftover bits in a9 from zero and propagate any + borrow from a2. */ + neg a9, a9 + addi a10, a2, -1 + movnez a2, a10, a9 + + /* Check if the subtract underflowed into the exponent. */ + extui a10, a2, 23, 8 + beq a10, a7, .Lsub_round + j .Lsub_borrow + +.Lsub_yexpzero: + /* Return zero if the inputs are equal. (For the non-subnormal + case, subtracting the "1.0" will cause a borrow from the exponent + and this case can be detected when handling the borrow.) */ + beq a2, a3, .Lsub_return_zero + + /* y is a subnormal value. 
Replace its sign/exponent with zero, + i.e., no implicit "1.0". Unless x is also a subnormal, increment + y's apparent exponent because subnormals behave as if they had + the minimum (nonzero) exponent. */ + slli a3, a3, 9 + srli a3, a3, 9 + bnone a2, a6, .Lsub_yexpdiff + addi a8, a8, 1 + j .Lsub_yexpdiff + +.Lsub_returny: + /* Negate and return y. */ + slli a7, a6, 8 + xor a2, a3, a7 +1: leaf_return + +.Lsub_xsmaller: + /* Same thing as the "ysmaller" code, but with x and y swapped and + with y negated. */ + bnone a2, a6, .Lsub_xexpzero + + or a2, a2, a6 + slli a2, a2, 8 + srli a2, a2, 8 + +.Lsub_xexpdiff: + sub a10, a8, a7 + bgeui a10, 32, .Lsub_returny + + ssr a10 + movi a9, 0 + src a9, a2, a9 + srl a2, a2 + + /* Negate y. */ + slli a11, a6, 8 + xor a3, a3, a11 + + sub a2, a3, a2 + + neg a9, a9 + addi a10, a2, -1 + movnez a2, a10, a9 + + /* Check if the subtract underflowed into the exponent. */ + extui a10, a2, 23, 8 + bne a10, a8, .Lsub_borrow + +.Lsub_round: + /* Round up if the leftover fraction is >= 1/2. */ + bgez a9, 1f + addi a2, a2, 1 + + /* Check if the leftover fraction is exactly 1/2. */ + slli a9, a9, 1 + beqz a9, .Lsub_exactlyhalf +1: leaf_return + +.Lsub_xexpzero: + /* Same as "yexpzero". */ + beq a2, a3, .Lsub_return_zero + slli a2, a2, 9 + srli a2, a2, 9 + bnone a3, a6, .Lsub_xexpdiff + addi a7, a7, 1 + j .Lsub_xexpdiff + +.Lsub_return_zero: + movi a2, 0 + leaf_return + +.Lsub_borrow: + /* The subtraction has underflowed into the exponent field, so the + value needs to be renormalized. Shift the mantissa left as + needed to remove any leading zeros and adjust the exponent + accordingly. If the exponent is not large enough to remove + all the leading zeros, the result will be a subnormal value. */ + + slli a8, a2, 9 + beqz a8, .Lsub_xzero + do_nsau a6, a8, a7, a11 + srli a8, a8, 9 + bge a6, a10, .Lsub_subnormal + addi a6, a6, 1 + +.Lsub_normalize_shift: + /* Shift the mantissa (a8/a9) left by a6. 
*/ + ssl a6 + src a8, a8, a9 + sll a9, a9 + + /* Combine the shifted mantissa with the sign and exponent, + decrementing the exponent by a6. (The exponent has already + been decremented by one due to the borrow from the subtraction, + but adding the mantissa will increment the exponent by one.) */ + srli a2, a2, 23 + sub a2, a2, a6 + slli a2, a2, 23 + add a2, a2, a8 + j .Lsub_round + +.Lsub_exactlyhalf: + /* Round down to the nearest even value. */ + srli a2, a2, 1 + slli a2, a2, 1 + leaf_return + +.Lsub_xzero: + /* If there was a borrow from the exponent, and the mantissa and + guard digits are all zero, then the inputs were equal and the + result should be zero. */ + beqz a9, .Lsub_return_zero + + /* Only the guard digit is nonzero. Shift by min(24, a10). */ + addi a11, a10, -24 + movi a6, 24 + movltz a6, a10, a11 + j .Lsub_normalize_shift + +.Lsub_subnormal: + /* The exponent is too small to shift away all the leading zeros. + Set a6 to the current exponent (which has already been + decremented by the borrow) so that the exponent of the result + will be zero. Do not add 1 to a6 in this case, because: (1) + adding the mantissa will not increment the exponent, so there is + no need to subtract anything extra from the exponent to + compensate, and (2) the effective exponent of a subnormal is 1 + not 0 so the shift amount must be 1 smaller than normal. */ + mov a6, a10 + j .Lsub_normalize_shift + +#endif /* L_addsubsf3 */ + +#ifdef L_mulsf3 + + /* Multiplication */ +#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16 +#define XCHAL_NO_MUL 1 +#endif + +__mulsf3_aux: + + /* Handle unusual cases (zeros, subnormals, NaNs and Infinities). + (This code is placed before the start of the function just to + keep it in range of the limited branch displacements.) */ + +.Lmul_xexpzero: + /* Clear the sign bit of x. */ + slli a2, a2, 1 + srli a2, a2, 1 + + /* If x is zero, return zero. */ + beqz a2, .Lmul_return_zero + + /* Normalize x. Adjust the exponent in a8. 
*/ + do_nsau a10, a2, a11, a12 + addi a10, a10, -8 + ssl a10 + sll a2, a2 + movi a8, 1 + sub a8, a8, a10 + j .Lmul_xnormalized + +.Lmul_yexpzero: + /* Clear the sign bit of y. */ + slli a3, a3, 1 + srli a3, a3, 1 + + /* If y is zero, return zero. */ + beqz a3, .Lmul_return_zero + + /* Normalize y. Adjust the exponent in a9. */ + do_nsau a10, a3, a11, a12 + addi a10, a10, -8 + ssl a10 + sll a3, a3 + movi a9, 1 + sub a9, a9, a10 + j .Lmul_ynormalized + +.Lmul_return_zero: + /* Return zero with the appropriate sign bit. */ + srli a2, a7, 31 + slli a2, a2, 31 + j .Lmul_done + +.Lmul_xnan_or_inf: + /* If y is zero, return NaN. */ + slli a8, a3, 1 + bnez a8, 1f + movi a4, 0x400000 /* make it a quiet NaN */ + or a2, a2, a4 + j .Lmul_done +1: + /* If y is NaN, return y. */ + bnall a3, a6, .Lmul_returnx + slli a8, a3, 9 + beqz a8, .Lmul_returnx + +.Lmul_returny: + mov a2, a3 + +.Lmul_returnx: + /* Set the sign bit and return. */ + extui a7, a7, 31, 1 + slli a2, a2, 1 + ssai 1 + src a2, a7, a2 + j .Lmul_done + +.Lmul_ynan_or_inf: + /* If x is zero, return NaN. */ + slli a8, a2, 1 + bnez a8, .Lmul_returny + movi a7, 0x400000 /* make it a quiet NaN */ + or a2, a3, a7 + j .Lmul_done + + .align 4 + .global __mulsf3 + .type __mulsf3, @function +__mulsf3: +#if __XTENSA_CALL0_ABI__ + leaf_entry sp, 32 + addi sp, sp, -32 + s32i a12, sp, 16 + s32i a13, sp, 20 + s32i a14, sp, 24 + s32i a15, sp, 28 +#elif XCHAL_NO_MUL + /* This is not really a leaf function; allocate enough stack space + to allow CALL12s to a helper function. */ + leaf_entry sp, 64 +#else + leaf_entry sp, 32 +#endif + movi a6, 0x7f800000 + + /* Get the sign of the result. */ + xor a7, a2, a3 + + /* Check for NaN and infinity. */ + ball a2, a6, .Lmul_xnan_or_inf + ball a3, a6, .Lmul_ynan_or_inf + + /* Extract the exponents. */ + extui a8, a2, 23, 8 + extui a9, a3, 23, 8 + + beqz a8, .Lmul_xexpzero +.Lmul_xnormalized: + beqz a9, .Lmul_yexpzero +.Lmul_ynormalized: + + /* Add the exponents. 
*/ + add a8, a8, a9 + + /* Replace sign/exponent fields with explicit "1.0". */ + movi a10, 0xffffff + or a2, a2, a6 + and a2, a2, a10 + or a3, a3, a6 + and a3, a3, a10 + + /* Multiply 32x32 to 64 bits. The result ends up in a2/a6. */ + +#if XCHAL_HAVE_MUL32_HIGH + + mull a6, a2, a3 + muluh a2, a2, a3 + +#else + + /* Break the inputs into 16-bit chunks and compute 4 32-bit partial + products. These partial products are: + + 0 xl * yl + + 1 xl * yh + 2 xh * yl + + 3 xh * yh + + If using the Mul16 or Mul32 multiplier options, these input + chunks must be stored in separate registers. For Mac16, the + UMUL.AA.* opcodes can specify that the inputs come from either + half of the registers, so there is no need to shift them out + ahead of time. If there is no multiply hardware, the 16-bit + chunks can be extracted when setting up the arguments to the + separate multiply function. */ + +#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL + /* Calling a separate multiply function will clobber a0 and requires + use of a8 as a temporary, so save those values now. (The function + uses a custom ABI so nothing else needs to be saved.) */ + s32i a0, sp, 0 + s32i a8, sp, 4 +#endif + +#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32 + +#define a2h a4 +#define a3h a5 + + /* Get the high halves of the inputs into registers. */ + srli a2h, a2, 16 + srli a3h, a3, 16 + +#define a2l a2 +#define a3l a3 + +#if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16 + /* Clear the high halves of the inputs. This does not matter + for MUL16 because the high bits are ignored. 
*/ + extui a2, a2, 0, 16 + extui a3, a3, 0, 16 +#endif +#endif /* MUL16 || MUL32 */ + + +#if XCHAL_HAVE_MUL16 + +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + mul16u dst, xreg ## xhalf, yreg ## yhalf + +#elif XCHAL_HAVE_MUL32 + +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + mull dst, xreg ## xhalf, yreg ## yhalf + +#elif XCHAL_HAVE_MAC16 + +/* The preprocessor insists on inserting a space when concatenating after + a period in the definition of do_mul below. These macros are a workaround + using underscores instead of periods when doing the concatenation. */ +#define umul_aa_ll umul.aa.ll +#define umul_aa_lh umul.aa.lh +#define umul_aa_hl umul.aa.hl +#define umul_aa_hh umul.aa.hh + +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + umul_aa_ ## xhalf ## yhalf xreg, yreg; \ + rsr dst, ACCLO + +#else /* no multiply hardware */ + +#define set_arg_l(dst, src) \ + extui dst, src, 0, 16 +#define set_arg_h(dst, src) \ + srli dst, src, 16 + +#if __XTENSA_CALL0_ABI__ +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + set_arg_ ## xhalf (a13, xreg); \ + set_arg_ ## yhalf (a14, yreg); \ + call0 .Lmul_mulsi3; \ + mov dst, a12 +#else +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + set_arg_ ## xhalf (a14, xreg); \ + set_arg_ ## yhalf (a15, yreg); \ + call12 .Lmul_mulsi3; \ + mov dst, a14 +#endif /* __XTENSA_CALL0_ABI__ */ + +#endif /* no multiply hardware */ + + /* Add pp1 and pp2 into a6 with carry-out in a9. */ + do_mul(a6, a2, l, a3, h) /* pp 1 */ + do_mul(a11, a2, h, a3, l) /* pp 2 */ + movi a9, 0 + add a6, a6, a11 + bgeu a6, a11, 1f + addi a9, a9, 1 +1: + /* Shift the high half of a9/a6 into position in a9. Note that + this value can be safely incremented without any carry-outs. */ + ssai 16 + src a9, a9, a6 + + /* Compute the low word into a6. */ + do_mul(a11, a2, l, a3, l) /* pp 0 */ + sll a6, a6 + add a6, a6, a11 + bgeu a6, a11, 1f + addi a9, a9, 1 +1: + /* Compute the high word into a2. 
*/ + do_mul(a2, a2, h, a3, h) /* pp 3 */ + add a2, a2, a9 + +#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL + /* Restore values saved on the stack during the multiplication. */ + l32i a0, sp, 0 + l32i a8, sp, 4 +#endif +#endif /* ! XCHAL_HAVE_MUL32_HIGH */ + + /* Shift left by 9 bits, unless there was a carry-out from the + multiply, in which case, shift by 8 bits and increment the + exponent. */ + movi a4, 9 + srli a5, a2, 24 - 9 + beqz a5, 1f + addi a4, a4, -1 + addi a8, a8, 1 +1: ssl a4 + src a2, a2, a6 + sll a6, a6 + + /* Subtract the extra bias from the exponent sum (plus one to account + for the explicit "1.0" of the mantissa that will be added to the + exponent in the final result). */ + movi a4, 0x80 + sub a8, a8, a4 + + /* Check for over/underflow. The value in a8 is one less than the + final exponent, so values in the range 0..fd are OK here. */ + movi a4, 0xfe + bgeu a8, a4, .Lmul_overflow + +.Lmul_round: + /* Round. */ + bgez a6, .Lmul_rounded + addi a2, a2, 1 + slli a6, a6, 1 + beqz a6, .Lmul_exactlyhalf + +.Lmul_rounded: + /* Add the exponent to the mantissa. */ + slli a8, a8, 23 + add a2, a2, a8 + +.Lmul_addsign: + /* Add the sign bit. */ + srli a7, a7, 31 + slli a7, a7, 31 + or a2, a2, a7 + +.Lmul_done: +#if __XTENSA_CALL0_ABI__ + l32i a12, sp, 16 + l32i a13, sp, 20 + l32i a14, sp, 24 + l32i a15, sp, 28 + addi sp, sp, 32 +#endif + leaf_return + +.Lmul_exactlyhalf: + /* Round down to the nearest even value. */ + srli a2, a2, 1 + slli a2, a2, 1 + j .Lmul_rounded + +.Lmul_overflow: + bltz a8, .Lmul_underflow + /* Return +/- Infinity. */ + movi a8, 0xff + slli a2, a8, 23 + j .Lmul_addsign + +.Lmul_underflow: + /* Create a subnormal value, where the exponent field contains zero, + but the effective exponent is 1. The value of a8 is one less than + the actual exponent, so just negate it to get the shift amount. */ + neg a8, a8 + mov a9, a6 + ssr a8 + bgeui a8, 32, .Lmul_flush_to_zero + + /* Shift a2 right. 
Any bits that are shifted out of a2 are saved + in a6 (combined with the shifted-out bits currently in a6) for + rounding the result. */ + sll a6, a2 + srl a2, a2 + + /* Set the exponent to zero. */ + movi a8, 0 + + /* Pack any nonzero bits shifted out into a6. */ + beqz a9, .Lmul_round + movi a9, 1 + or a6, a6, a9 + j .Lmul_round + +.Lmul_flush_to_zero: + /* Return zero with the appropriate sign bit. */ + srli a2, a7, 31 + slli a2, a2, 31 + j .Lmul_done + +#if XCHAL_NO_MUL + + /* For Xtensa processors with no multiply hardware, this simplified + version of _mulsi3 is used for multiplying 16-bit chunks of + the floating-point mantissas. When using CALL0, this function + uses a custom ABI: the inputs are passed in a13 and a14, the + result is returned in a12, and a8 and a15 are clobbered. */ + .align 4 +.Lmul_mulsi3: + leaf_entry sp, 16 + .macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2 + movi \dst, 0 +1: add \tmp1, \src2, \dst + extui \tmp2, \src1, 0, 1 + movnez \dst, \tmp1, \tmp2 + + do_addx2 \tmp1, \src2, \dst, \tmp1 + extui \tmp2, \src1, 1, 1 + movnez \dst, \tmp1, \tmp2 + + do_addx4 \tmp1, \src2, \dst, \tmp1 + extui \tmp2, \src1, 2, 1 + movnez \dst, \tmp1, \tmp2 + + do_addx8 \tmp1, \src2, \dst, \tmp1 + extui \tmp2, \src1, 3, 1 + movnez \dst, \tmp1, \tmp2 + + srli \src1, \src1, 4 + slli \src2, \src2, 4 + bnez \src1, 1b + .endm +#if __XTENSA_CALL0_ABI__ + mul_mulsi3_body a12, a13, a14, a15, a8 +#else + /* The result will be written into a2, so save that argument in a4. */ + mov a4, a2 + mul_mulsi3_body a2, a4, a3, a5, a6 +#endif + leaf_return +#endif /* XCHAL_NO_MUL */ +#endif /* L_mulsf3 */ + +#ifdef L_divsf3 + + /* Division */ +__divsf3_aux: + + /* Handle unusual cases (zeros, subnormals, NaNs and Infinities). + (This code is placed before the start of the function just to + keep it in range of the limited branch displacements.) */ + +.Ldiv_yexpzero: + /* Clear the sign bit of y. */ + slli a3, a3, 1 + srli a3, a3, 1 + + /* Check for division by zero. 
*/ + beqz a3, .Ldiv_yzero + + /* Normalize y. Adjust the exponent in a9. */ + do_nsau a10, a3, a4, a5 + addi a10, a10, -8 + ssl a10 + sll a3, a3 + movi a9, 1 + sub a9, a9, a10 + j .Ldiv_ynormalized + +.Ldiv_yzero: + /* y is zero. Return NaN if x is also zero; otherwise, infinity. */ + slli a4, a2, 1 + srli a4, a4, 1 + srli a2, a7, 31 + slli a2, a2, 31 + or a2, a2, a6 + bnez a4, 1f + movi a4, 0x400000 /* make it a quiet NaN */ + or a2, a2, a4 +1: leaf_return + +.Ldiv_xexpzero: + /* Clear the sign bit of x. */ + slli a2, a2, 1 + srli a2, a2, 1 + + /* If x is zero, return zero. */ + beqz a2, .Ldiv_return_zero + + /* Normalize x. Adjust the exponent in a8. */ + do_nsau a10, a2, a4, a5 + addi a10, a10, -8 + ssl a10 + sll a2, a2 + movi a8, 1 + sub a8, a8, a10 + j .Ldiv_xnormalized + +.Ldiv_return_zero: + /* Return zero with the appropriate sign bit. */ + srli a2, a7, 31 + slli a2, a2, 31 + leaf_return + +.Ldiv_xnan_or_inf: + /* Set the sign bit of the result. */ + srli a7, a3, 31 + slli a7, a7, 31 + xor a2, a2, a7 + /* If y is NaN or Inf, return NaN. */ + bnall a3, a6, 1f + movi a4, 0x400000 /* make it a quiet NaN */ + or a2, a2, a4 +1: leaf_return + +.Ldiv_ynan_or_inf: + /* If y is Infinity, return zero. */ + slli a8, a3, 9 + beqz a8, .Ldiv_return_zero + /* y is NaN; return it. */ + mov a2, a3 + leaf_return + + .align 4 + .global __divsf3 + .type __divsf3, @function +__divsf3: + leaf_entry sp, 16 + movi a6, 0x7f800000 + + /* Get the sign of the result. */ + xor a7, a2, a3 + + /* Check for NaN and infinity. */ + ball a2, a6, .Ldiv_xnan_or_inf + ball a3, a6, .Ldiv_ynan_or_inf + + /* Extract the exponents. */ + extui a8, a2, 23, 8 + extui a9, a3, 23, 8 + + beqz a9, .Ldiv_yexpzero +.Ldiv_ynormalized: + beqz a8, .Ldiv_xexpzero +.Ldiv_xnormalized: + + /* Subtract the exponents. */ + sub a8, a8, a9 + + /* Replace sign/exponent fields with explicit "1.0". 
*/ + movi a10, 0xffffff + or a2, a2, a6 + and a2, a2, a10 + or a3, a3, a6 + and a3, a3, a10 + + /* The first digit of the mantissa division must be a one. + Shift x (and adjust the exponent) as needed to make this true. */ + bltu a3, a2, 1f + slli a2, a2, 1 + addi a8, a8, -1 +1: + /* Do the first subtraction and shift. */ + sub a2, a2, a3 + slli a2, a2, 1 + + /* Put the quotient into a10. */ + movi a10, 1 + + /* Divide one bit at a time for 23 bits. */ + movi a9, 23 +#if XCHAL_HAVE_LOOPS + loop a9, .Ldiv_loopend +#endif +.Ldiv_loop: + /* Shift the quotient << 1. */ + slli a10, a10, 1 + + /* Is this digit a 0 or 1? */ + bltu a2, a3, 1f + + /* Output a 1 and subtract. */ + addi a10, a10, 1 + sub a2, a2, a3 + + /* Shift the dividend << 1. */ +1: slli a2, a2, 1 + +#if !XCHAL_HAVE_LOOPS + addi a9, a9, -1 + bnez a9, .Ldiv_loop +#endif +.Ldiv_loopend: + + /* Add the exponent bias (less one to account for the explicit "1.0" + of the mantissa that will be added to the exponent in the final + result). */ + addi a8, a8, 0x7e + + /* Check for over/underflow. The value in a8 is one less than the + final exponent, so values in the range 0..fd are OK here. */ + movi a4, 0xfe + bgeu a8, a4, .Ldiv_overflow + +.Ldiv_round: + /* Round. The remainder (<< 1) is in a2. */ + bltu a2, a3, .Ldiv_rounded + addi a10, a10, 1 + beq a2, a3, .Ldiv_exactlyhalf + +.Ldiv_rounded: + /* Add the exponent to the mantissa. */ + slli a8, a8, 23 + add a2, a10, a8 + +.Ldiv_addsign: + /* Add the sign bit. */ + srli a7, a7, 31 + slli a7, a7, 31 + or a2, a2, a7 + leaf_return + +.Ldiv_overflow: + bltz a8, .Ldiv_underflow + /* Return +/- Infinity. */ + addi a8, a4, 1 /* 0xff */ + slli a2, a8, 23 + j .Ldiv_addsign + +.Ldiv_exactlyhalf: + /* Remainder is exactly half the divisor. Round even. */ + srli a10, a10, 1 + slli a10, a10, 1 + j .Ldiv_rounded + +.Ldiv_underflow: + /* Create a subnormal value, where the exponent field contains zero, + but the effective exponent is 1. 
The value of a8 is one less than + the actual exponent, so just negate it to get the shift amount. */ + neg a8, a8 + ssr a8 + bgeui a8, 32, .Ldiv_flush_to_zero + + /* Shift a10 right. Any bits that are shifted out of a10 are + saved in a6 for rounding the result. */ + sll a6, a10 + srl a10, a10 + + /* Set the exponent to zero. */ + movi a8, 0 + + /* Pack any nonzero remainder (in a2) into a6. */ + beqz a2, 1f + movi a9, 1 + or a6, a6, a9 + + /* Round a10 based on the bits shifted out into a6. */ +1: bgez a6, .Ldiv_rounded + addi a10, a10, 1 + slli a6, a6, 1 + bnez a6, .Ldiv_rounded + srli a10, a10, 1 + slli a10, a10, 1 + j .Ldiv_rounded + +.Ldiv_flush_to_zero: + /* Return zero with the appropriate sign bit. */ + srli a2, a7, 31 + slli a2, a2, 31 + leaf_return + +#endif /* L_divsf3 */ + +#ifdef L_cmpsf2 + + /* Equal and Not Equal */ + + .align 4 + .global __eqsf2 + .global __nesf2 + .set __nesf2, __eqsf2 + .type __eqsf2, @function +__eqsf2: + leaf_entry sp, 16 + bne a2, a3, 4f + + /* The values are equal but NaN != NaN. Check the exponent. */ + movi a6, 0x7f800000 + ball a2, a6, 3f + + /* Equal. */ + movi a2, 0 + leaf_return + + /* Not equal. (No branch visible in __eqsf2 targets label 2.) */ +2: movi a2, 1 + leaf_return + + /* Check if the mantissas are nonzero. */ +3: slli a7, a2, 9 + j 5f + + /* Check if x and y are zero with different signs. */ +4: or a7, a2, a3 + slli a7, a7, 1 + + /* Return 0 (equal) if a7 == 0, otherwise 1. Here a7 is either (x | y) << 1, i.e. x | y without the sign bit, when x != y, or the mantissa + of x (x << 9) when all exponent bits of x (mask 0x7f800000) are set and x == y. */ +5: movi a2, 0 + movi a3, 1 + movnez a2, a3, a7 + leaf_return + + + /* Greater Than */ + + .align 4 + .global __gtsf2 + .type __gtsf2, @function +__gtsf2: + leaf_entry sp, 16 + movi a6, 0x7f800000 + ball a2, a6, 2f +1: bnall a3, a6, .Lle_cmp + + /* Check if y is a NaN. */ + slli a7, a3, 9 + beqz a7, .Lle_cmp + movi a2, 0 + leaf_return + + /* Check if x is a NaN. 
*/ +2: slli a7, a2, 9 + beqz a7, 1b + movi a2, 0 + leaf_return + + + /* Less Than or Equal */ + + .align 4 + .global __lesf2 + .type __lesf2, @function +__lesf2: + leaf_entry sp, 16 + movi a6, 0x7f800000 + ball a2, a6, 2f +1: bnall a3, a6, .Lle_cmp + + /* Check if y is a NaN. */ + slli a7, a3, 9 + beqz a7, .Lle_cmp + movi a2, 1 + leaf_return + + /* Check if x is a NaN. */ +2: slli a7, a2, 9 + beqz a7, 1b + movi a2, 1 + leaf_return + +.Lle_cmp: + /* Check if x and y have different signs. */ + xor a7, a2, a3 + bltz a7, .Lle_diff_signs + + /* Check if x is negative. */ + bltz a2, .Lle_xneg + + /* Check if x <= y. */ + bltu a3, a2, 5f +4: movi a2, 0 + leaf_return + +.Lle_xneg: + /* Check if y <= x. */ + bgeu a2, a3, 4b +5: movi a2, 1 + leaf_return + +.Lle_diff_signs: + bltz a2, 4b + + /* Check if both x and y are zero. */ + or a7, a2, a3 + slli a7, a7, 1 + movi a2, 1 + movi a3, 0 + moveqz a2, a3, a7 + leaf_return + + + /* Greater Than or Equal */ + + .align 4 + .global __gesf2 + .type __gesf2, @function +__gesf2: + leaf_entry sp, 16 + movi a6, 0x7f800000 + ball a2, a6, 2f +1: bnall a3, a6, .Llt_cmp + + /* Check if y is a NaN. */ + slli a7, a3, 9 + beqz a7, .Llt_cmp + movi a2, -1 + leaf_return + + /* Check if x is a NaN. */ +2: slli a7, a2, 9 + beqz a7, 1b + movi a2, -1 + leaf_return + + + /* Less Than */ + + .align 4 + .global __ltsf2 + .type __ltsf2, @function +__ltsf2: + leaf_entry sp, 16 + movi a6, 0x7f800000 + ball a2, a6, 2f +1: bnall a3, a6, .Llt_cmp + + /* Check if y is a NaN. */ + slli a7, a3, 9 + beqz a7, .Llt_cmp + movi a2, 0 + leaf_return + + /* Check if x is a NaN. */ +2: slli a7, a2, 9 + beqz a7, 1b + movi a2, 0 + leaf_return + +.Llt_cmp: + /* Check if x and y have different signs. */ + xor a7, a2, a3 + bltz a7, .Llt_diff_signs + + /* Check if x is negative. */ + bltz a2, .Llt_xneg + + /* Check if x < y. */ + bgeu a2, a3, 5f +4: movi a2, -1 + leaf_return + +.Llt_xneg: + /* Check if y < x. 
*/ + bltu a3, a2, 4b +5: movi a2, 0 + leaf_return + +.Llt_diff_signs: + bgez a2, 5b + + /* x has its sign bit set and y does not: return -1 unless both x and y are zero, in which case return 0. */ + or a7, a2, a3 + slli a7, a7, 1 + movi a2, 0 + movi a3, -1 + movnez a2, a3, a7 + leaf_return + + + /* Unordered: return 1 if x or y has all exponent bits set and a nonzero mantissa (a NaN), otherwise 0. */ + + .align 4 + .global __unordsf2 + .type __unordsf2, @function +__unordsf2: + leaf_entry sp, 16 + movi a6, 0x7f800000 + ball a2, a6, 3f +1: ball a3, a6, 4f +2: movi a2, 0 + leaf_return + +3: slli a7, a2, 9 + beqz a7, 1b + movi a2, 1 + leaf_return + +4: slli a7, a3, 9 + beqz a7, 2b + movi a2, 1 + leaf_return + +#endif /* L_cmpsf2 */ + +#ifdef L_fixsfsi + + .align 4 + .global __fixsfsi + .type __fixsfsi, @function +__fixsfsi: + leaf_entry sp, 16 + + /* Check for NaN and Infinity. */ + movi a6, 0x7f800000 + ball a2, a6, .Lfixsfsi_nan_or_inf + + /* Extract the exponent and check if 0 < (exp - 0x7e) < 32. */ + extui a4, a2, 23, 8 + addi a4, a4, -0x7e + bgei a4, 32, .Lfixsfsi_maxint + blti a4, 1, .Lfixsfsi_zero + + /* Add explicit "1.0" and shift << 8. */ + or a7, a2, a6 + slli a5, a7, 8 + + /* Shift back to the right, based on the exponent. */ + ssl a4 /* shift by 32 - a4 */ + srl a5, a5 + + /* Negate the result if sign != 0. */ + neg a2, a5 + movgez a2, a5, a7 + leaf_return + +.Lfixsfsi_nan_or_inf: + /* Handle Infinity and NaN. */ + slli a4, a2, 9 + beqz a4, .Lfixsfsi_maxint + + /* Translate NaN to +maxint. */ + movi a2, 0 + +.Lfixsfsi_maxint: + slli a4, a6, 8 /* 0x80000000 */ + addi a5, a4, -1 /* 0x7fffffff */ + movgez a4, a5, a2 + mov a2, a4 + leaf_return + +.Lfixsfsi_zero: + movi a2, 0 + leaf_return + +#endif /* L_fixsfsi */ + +#ifdef L_fixsfdi + + .align 4 + .global __fixsfdi + .type __fixsfdi, @function +__fixsfdi: + leaf_entry sp, 16 + + /* Check for NaN and Infinity. */ + movi a6, 0x7f800000 + ball a2, a6, .Lfixsfdi_nan_or_inf + + /* Extract the exponent and check if 0 < (exp - 0x7e) < 64. 
*/ + extui a4, a2, 23, 8 + addi a4, a4, -0x7e + bgei a4, 64, .Lfixsfdi_maxint + blti a4, 1, .Lfixsfdi_zero + + /* Add explicit "1.0" and shift << 8. */ + or a7, a2, a6 + slli xh, a7, 8 + + /* Shift back to the right, based on the exponent. */ + ssl a4 /* shift by 64 - a4 */ + bgei a4, 32, .Lfixsfdi_smallshift + srl xl, xh + movi xh, 0 + +.Lfixsfdi_shifted: + /* Negate the result if sign != 0. */ + bgez a7, 1f + neg xl, xl + neg xh, xh + beqz xl, 1f + addi xh, xh, -1 +1: leaf_return + +.Lfixsfdi_smallshift: + movi xl, 0 + sll xl, xh + srl xh, xh + j .Lfixsfdi_shifted + +.Lfixsfdi_nan_or_inf: + /* Handle Infinity and NaN. */ + slli a4, a2, 9 + beqz a4, .Lfixsfdi_maxint + + /* Translate NaN to +maxint. */ + movi a2, 0 + +.Lfixsfdi_maxint: + slli a7, a6, 8 /* 0x80000000 */ + bgez a2, 1f + mov xh, a7 + movi xl, 0 + leaf_return + +1: addi xh, a7, -1 /* 0x7fffffff */ + movi xl, -1 + leaf_return + +.Lfixsfdi_zero: + movi xh, 0 + movi xl, 0 + leaf_return + +#endif /* L_fixsfdi */ + +#ifdef L_fixunssfsi + + .align 4 + .global __fixunssfsi + .type __fixunssfsi, @function +__fixunssfsi: + leaf_entry sp, 16 + + /* Check for NaN and Infinity. */ + movi a6, 0x7f800000 + ball a2, a6, .Lfixunssfsi_nan_or_inf + + /* Extract the exponent and check if 0 <= (exp - 0x7f) < 32. */ + extui a4, a2, 23, 8 + addi a4, a4, -0x7f + bgei a4, 32, .Lfixunssfsi_maxint + bltz a4, .Lfixunssfsi_zero + + /* Add explicit "1.0" and shift << 8. */ + or a7, a2, a6 + slli a5, a7, 8 + + /* Shift back to the right, based on the exponent. */ + addi a4, a4, 1 + beqi a4, 32, .Lfixunssfsi_bigexp + ssl a4 /* shift by 32 - a4 */ + srl a5, a5 + + /* Negate the result if sign != 0. */ + neg a2, a5 + movgez a2, a5, a7 + leaf_return + +.Lfixunssfsi_nan_or_inf: + /* Handle Infinity and NaN. */ + slli a4, a2, 9 + beqz a4, .Lfixunssfsi_maxint + + /* Translate NaN to 0xffffffff. 
*/ + movi a2, -1 + leaf_return + +.Lfixunssfsi_maxint: + slli a4, a6, 8 /* 0x80000000 */ + movi a5, -1 /* 0xffffffff */ + movgez a4, a5, a2 + mov a2, a4 + leaf_return + +.Lfixunssfsi_zero: + movi a2, 0 + leaf_return + +.Lfixunssfsi_bigexp: + /* Handle unsigned maximum exponent case. */ + bltz a2, 1f + mov a2, a5 /* no shift needed */ + leaf_return + + /* Return 0x80000000 if negative. */ +1: slli a2, a6, 8 + leaf_return + +#endif /* L_fixunssfsi */ + +#ifdef L_fixunssfdi + + .align 4 + .global __fixunssfdi + .type __fixunssfdi, @function +__fixunssfdi: + leaf_entry sp, 16 + + /* Check for NaN and Infinity. */ + movi a6, 0x7f800000 + ball a2, a6, .Lfixunssfdi_nan_or_inf + + /* Extract the exponent and check if 0 <= (exp - 0x7f) < 64. */ + extui a4, a2, 23, 8 + addi a4, a4, -0x7f + bgei a4, 64, .Lfixunssfdi_maxint + bltz a4, .Lfixunssfdi_zero + + /* Add explicit "1.0" and shift << 8. */ + or a7, a2, a6 + slli xh, a7, 8 + + /* Shift back to the right, based on the exponent. */ + addi a4, a4, 1 + beqi a4, 64, .Lfixunssfdi_bigexp + ssl a4 /* shift by 64 - a4 */ + bgei a4, 32, .Lfixunssfdi_smallshift + srl xl, xh + movi xh, 0 + +.Lfixunssfdi_shifted: + /* Negate the result if sign != 0. */ + bgez a7, 1f + neg xl, xl + neg xh, xh + beqz xl, 1f + addi xh, xh, -1 +1: leaf_return + +.Lfixunssfdi_smallshift: + movi xl, 0 + src xl, xh, xl + srl xh, xh + j .Lfixunssfdi_shifted + +.Lfixunssfdi_nan_or_inf: + /* Handle Infinity and NaN. */ + slli a4, a2, 9 + beqz a4, .Lfixunssfdi_maxint + + /* Translate NaN to 0xffffffff.... */ +1: movi xh, -1 + movi xl, -1 + leaf_return + +.Lfixunssfdi_maxint: + bgez a2, 1b +2: slli xh, a6, 8 /* 0x80000000 */ + movi xl, 0 + leaf_return + +.Lfixunssfdi_zero: + movi xh, 0 + movi xl, 0 + leaf_return + +.Lfixunssfdi_bigexp: + /* Handle unsigned maximum exponent case. 
*/ + bltz a7, 2b + movi xl, 0 + leaf_return /* no shift needed */ + +#endif /* L_fixunssfdi */ + +#ifdef L_floatsisf + + .align 4 + .global __floatunsisf + .type __floatunsisf, @function +__floatunsisf: + leaf_entry sp, 16 + beqz a2, .Lfloatsisf_return + + /* Set the sign to zero and jump to the floatsisf code. */ + movi a7, 0 + j .Lfloatsisf_normalize + + .align 4 + .global __floatsisf + .type __floatsisf, @function +__floatsisf: + leaf_entry sp, 16 + + /* Check for zero. */ + beqz a2, .Lfloatsisf_return + + /* Save the sign. */ + extui a7, a2, 31, 1 + + /* Get the absolute value. */ +#if XCHAL_HAVE_ABS + abs a2, a2 +#else + neg a4, a2 + movltz a2, a4, a2 +#endif + +.Lfloatsisf_normalize: + /* Normalize with the first 1 bit in the msb. */ + do_nsau a4, a2, a5, a6 + ssl a4 + sll a5, a2 + + /* Shift the mantissa into position, with rounding bits in a6. */ + srli a2, a5, 8 + slli a6, a5, (32 - 8) + + /* Set the exponent. */ + movi a5, 0x9d /* 0x7e + 31 */ + sub a5, a5, a4 + slli a5, a5, 23 + add a2, a2, a5 + + /* Add the sign. */ + slli a7, a7, 31 + or a2, a2, a7 + + /* Round up if the leftover fraction is >= 1/2. */ + bgez a6, .Lfloatsisf_return + addi a2, a2, 1 /* Overflow to the exponent is OK. */ + + /* Check if the leftover fraction is exactly 1/2. */ + slli a6, a6, 1 + beqz a6, .Lfloatsisf_exactlyhalf + +.Lfloatsisf_return: + leaf_return + +.Lfloatsisf_exactlyhalf: + /* Round down to the nearest even value. */ + srli a2, a2, 1 + slli a2, a2, 1 + leaf_return + +#endif /* L_floatsisf */ + +#ifdef L_floatdisf + + .align 4 + .global __floatundisf + .type __floatundisf, @function +__floatundisf: + leaf_entry sp, 16 + + /* Check for zero. */ + or a4, xh, xl + beqz a4, 2f + + /* Set the sign to zero and jump to the floatdisf code. */ + movi a7, 0 + j .Lfloatdisf_normalize + + .align 4 + .global __floatdisf + .type __floatdisf, @function +__floatdisf: + leaf_entry sp, 16 + + /* Check for zero. */ + or a4, xh, xl + beqz a4, 2f + + /* Save the sign. 
*/ + extui a7, xh, 31, 1 + + /* Get the absolute value. */ + bgez xh, .Lfloatdisf_normalize + neg xl, xl + neg xh, xh + beqz xl, .Lfloatdisf_normalize + addi xh, xh, -1 + +.Lfloatdisf_normalize: + /* Normalize with the first 1 bit in the msb of xh. */ + beqz xh, .Lfloatdisf_bigshift + do_nsau a4, xh, a5, a6 + ssl a4 + src xh, xh, xl + sll xl, xl + +.Lfloatdisf_shifted: + /* Shift the mantissa into position, with rounding bits in a6. */ + ssai 8 + sll a5, xl + src a6, xh, xl + srl xh, xh + beqz a5, 1f + movi a5, 1 + or a6, a6, a5 +1: + /* Set the exponent. */ + movi a5, 0xbd /* 0x7e + 63 */ + sub a5, a5, a4 + slli a5, a5, 23 + add a2, xh, a5 + + /* Add the sign. */ + slli a7, a7, 31 + or a2, a2, a7 + + /* Round up if the leftover fraction is >= 1/2. */ + bgez a6, 2f + addi a2, a2, 1 /* Overflow to the exponent is OK. */ + + /* Check if the leftover fraction is exactly 1/2. */ + slli a6, a6, 1 + beqz a6, .Lfloatdisf_exactlyhalf +2: leaf_return + +.Lfloatdisf_bigshift: + /* xh is zero. Normalize with first 1 bit of xl in the msb of xh. */ + do_nsau a4, xl, a5, a6 + ssl a4 + sll xh, xl + movi xl, 0 + addi a4, a4, 32 + j .Lfloatdisf_shifted + +.Lfloatdisf_exactlyhalf: + /* Round down to the nearest even value. */ + srli a2, a2, 1 + slli a2, a2, 1 + leaf_return + +#endif /* L_floatdisf */ diff --git a/libgcc/config/xtensa/lib1funcs.S b/libgcc/config/xtensa/lib1funcs.S new file mode 100644 index 00000000000..071b9171177 --- /dev/null +++ b/libgcc/config/xtensa/lib1funcs.S @@ -0,0 +1,845 @@ +/* Assembly functions for the Xtensa version of libgcc1. + Copyright (C) 2001, 2002, 2003, 2005, 2006, 2007, 2009 + Free Software Foundation, Inc. + Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. 
+ +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + +#include "xtensa-config.h" + +/* Define macros for the ABS and ADDX* instructions to handle cases + where they are not included in the Xtensa processor configuration. */ + + .macro do_abs dst, src, tmp +#if XCHAL_HAVE_ABS + abs \dst, \src +#else + neg \tmp, \src + movgez \tmp, \src, \src + mov \dst, \tmp +#endif + .endm + + .macro do_addx2 dst, as, at, tmp +#if XCHAL_HAVE_ADDX + addx2 \dst, \as, \at +#else + slli \tmp, \as, 1 + add \dst, \tmp, \at +#endif + .endm + + .macro do_addx4 dst, as, at, tmp +#if XCHAL_HAVE_ADDX + addx4 \dst, \as, \at +#else + slli \tmp, \as, 2 + add \dst, \tmp, \at +#endif + .endm + + .macro do_addx8 dst, as, at, tmp +#if XCHAL_HAVE_ADDX + addx8 \dst, \as, \at +#else + slli \tmp, \as, 3 + add \dst, \tmp, \at +#endif + .endm + +/* Define macros for leaf function entry and return, supporting either the + standard register windowed ABI or the non-windowed call0 ABI. These + macros do not allocate any extra stack space, so they only work for + leaf functions that do not need to spill anything to the stack. 
*/ + + .macro leaf_entry reg, size +#if XCHAL_HAVE_WINDOWED && !__XTENSA_CALL0_ABI__ + entry \reg, \size +#else + /* do nothing */ +#endif + .endm + + .macro leaf_return +#if XCHAL_HAVE_WINDOWED && !__XTENSA_CALL0_ABI__ + retw +#else + ret +#endif + .endm + + +#ifdef L_mulsi3 + .align 4 + .global __mulsi3 + .type __mulsi3, @function +__mulsi3: + leaf_entry sp, 16 + +#if XCHAL_HAVE_MUL32 + mull a2, a2, a3 + +#elif XCHAL_HAVE_MUL16 + or a4, a2, a3 + srai a4, a4, 16 + bnez a4, .LMUL16 + mul16u a2, a2, a3 + leaf_return +.LMUL16: + srai a4, a2, 16 + srai a5, a3, 16 + mul16u a7, a4, a3 + mul16u a6, a5, a2 + mul16u a4, a2, a3 + add a7, a7, a6 + slli a7, a7, 16 + add a2, a7, a4 + +#elif XCHAL_HAVE_MAC16 + mul.aa.hl a2, a3 + mula.aa.lh a2, a3 + rsr a5, ACCLO + umul.aa.ll a2, a3 + rsr a4, ACCLO + slli a5, a5, 16 + add a2, a4, a5 + +#else /* !MUL32 && !MUL16 && !MAC16 */ + + /* Multiply one bit at a time, but unroll the loop 4x to better + exploit the addx instructions and avoid overhead. + Peel the first iteration to save a cycle on init. */ + + /* Avoid negative numbers. */ + xor a5, a2, a3 /* Top bit is 1 if one input is negative. */ + do_abs a3, a3, a6 + do_abs a2, a2, a6 + + /* Swap so the second argument is smaller. 
*/ + sub a7, a2, a3 + mov a4, a3 + movgez a4, a2, a7 /* a4 = max (a2, a3) */ + movltz a3, a2, a7 /* a3 = min (a2, a3) */ + + movi a2, 0 + extui a6, a3, 0, 1 + movnez a2, a4, a6 + + do_addx2 a7, a4, a2, a7 + extui a6, a3, 1, 1 + movnez a2, a7, a6 + + do_addx4 a7, a4, a2, a7 + extui a6, a3, 2, 1 + movnez a2, a7, a6 + + do_addx8 a7, a4, a2, a7 + extui a6, a3, 3, 1 + movnez a2, a7, a6 + + bgeui a3, 16, .Lmult_main_loop + neg a3, a2 + movltz a2, a3, a5 + leaf_return + + .align 4 +.Lmult_main_loop: + srli a3, a3, 4 + slli a4, a4, 4 + + add a7, a4, a2 + extui a6, a3, 0, 1 + movnez a2, a7, a6 + + do_addx2 a7, a4, a2, a7 + extui a6, a3, 1, 1 + movnez a2, a7, a6 + + do_addx4 a7, a4, a2, a7 + extui a6, a3, 2, 1 + movnez a2, a7, a6 + + do_addx8 a7, a4, a2, a7 + extui a6, a3, 3, 1 + movnez a2, a7, a6 + + bgeui a3, 16, .Lmult_main_loop + + neg a3, a2 + movltz a2, a3, a5 + +#endif /* !MUL32 && !MUL16 && !MAC16 */ + + leaf_return + .size __mulsi3, . - __mulsi3 + +#endif /* L_mulsi3 */ + + +#ifdef L_umulsidi3 + +#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16 +#define XCHAL_NO_MUL 1 +#endif + + .align 4 + .global __umulsidi3 + .type __umulsidi3, @function +__umulsidi3: +#if __XTENSA_CALL0_ABI__ + leaf_entry sp, 32 + addi sp, sp, -32 + s32i a12, sp, 16 + s32i a13, sp, 20 + s32i a14, sp, 24 + s32i a15, sp, 28 +#elif XCHAL_NO_MUL + /* This is not really a leaf function; allocate enough stack space + to allow CALL12s to a helper function. */ + leaf_entry sp, 48 +#else + leaf_entry sp, 16 +#endif + +#ifdef __XTENSA_EB__ +#define wh a2 +#define wl a3 +#else +#define wh a3 +#define wl a2 +#endif /* __XTENSA_EB__ */ + + /* This code is taken from the mulsf3 routine in ieee754-sf.S. + See more comments there. */ + +#if XCHAL_HAVE_MUL32_HIGH + mull a6, a2, a3 + muluh wh, a2, a3 + mov wl, a6 + +#else /* ! MUL32_HIGH */ + +#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL + /* a0 and a8 will be clobbered by calling the multiply function + but a8 is not used here and need not be saved. 
*/ + s32i a0, sp, 0 +#endif + +#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32 + +#define a2h a4 +#define a3h a5 + + /* Get the high halves of the inputs into registers. */ + srli a2h, a2, 16 + srli a3h, a3, 16 + +#define a2l a2 +#define a3l a3 + +#if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16 + /* Clear the high halves of the inputs. This does not matter + for MUL16 because the high bits are ignored. */ + extui a2, a2, 0, 16 + extui a3, a3, 0, 16 +#endif +#endif /* MUL16 || MUL32 */ + + +#if XCHAL_HAVE_MUL16 + +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + mul16u dst, xreg ## xhalf, yreg ## yhalf + +#elif XCHAL_HAVE_MUL32 + +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + mull dst, xreg ## xhalf, yreg ## yhalf + +#elif XCHAL_HAVE_MAC16 + +/* The preprocessor insists on inserting a space when concatenating after + a period in the definition of do_mul below. These macros are a workaround + using underscores instead of periods when doing the concatenation. */ +#define umul_aa_ll umul.aa.ll +#define umul_aa_lh umul.aa.lh +#define umul_aa_hl umul.aa.hl +#define umul_aa_hh umul.aa.hh + +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + umul_aa_ ## xhalf ## yhalf xreg, yreg; \ + rsr dst, ACCLO + +#else /* no multiply hardware */ + +#define set_arg_l(dst, src) \ + extui dst, src, 0, 16 +#define set_arg_h(dst, src) \ + srli dst, src, 16 + +#if __XTENSA_CALL0_ABI__ +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + set_arg_ ## xhalf (a13, xreg); \ + set_arg_ ## yhalf (a14, yreg); \ + call0 .Lmul_mulsi3; \ + mov dst, a12 +#else +#define do_mul(dst, xreg, xhalf, yreg, yhalf) \ + set_arg_ ## xhalf (a14, xreg); \ + set_arg_ ## yhalf (a15, yreg); \ + call12 .Lmul_mulsi3; \ + mov dst, a14 +#endif /* __XTENSA_CALL0_ABI__ */ + +#endif /* no multiply hardware */ + + /* Add pp1 and pp2 into a6 with carry-out in a9. 
*/ + do_mul(a6, a2, l, a3, h) /* pp 1 */ + do_mul(a11, a2, h, a3, l) /* pp 2 */ + movi a9, 0 + add a6, a6, a11 + bgeu a6, a11, 1f + addi a9, a9, 1 +1: + /* Shift the high half of a9/a6 into position in a9. Note that + this value can be safely incremented without any carry-outs. */ + ssai 16 + src a9, a9, a6 + + /* Compute the low word into a6. */ + do_mul(a11, a2, l, a3, l) /* pp 0 */ + sll a6, a6 + add a6, a6, a11 + bgeu a6, a11, 1f + addi a9, a9, 1 +1: + /* Compute the high word into wh. */ + do_mul(wh, a2, h, a3, h) /* pp 3 */ + add wh, wh, a9 + mov wl, a6 + +#endif /* !MUL32_HIGH */ + +#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL + /* Restore the original return address. */ + l32i a0, sp, 0 +#endif +#if __XTENSA_CALL0_ABI__ + l32i a12, sp, 16 + l32i a13, sp, 20 + l32i a14, sp, 24 + l32i a15, sp, 28 + addi sp, sp, 32 +#endif + leaf_return + +#if XCHAL_NO_MUL + + /* For Xtensa processors with no multiply hardware, this simplified + version of _mulsi3 is used for multiplying the 16-bit halves of + the two 32-bit operands of __umulsidi3. When using CALL0, this function + uses a custom ABI: the inputs are passed in a13 and a14, the + result is returned in a12, and a8 and a15 are clobbered. */ + .align 4 +.Lmul_mulsi3: + leaf_entry sp, 16 + .macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2 + movi \dst, 0 +1: add \tmp1, \src2, \dst + extui \tmp2, \src1, 0, 1 + movnez \dst, \tmp1, \tmp2 + + do_addx2 \tmp1, \src2, \dst, \tmp1 + extui \tmp2, \src1, 1, 1 + movnez \dst, \tmp1, \tmp2 + + do_addx4 \tmp1, \src2, \dst, \tmp1 + extui \tmp2, \src1, 2, 1 + movnez \dst, \tmp1, \tmp2 + + do_addx8 \tmp1, \src2, \dst, \tmp1 + extui \tmp2, \src1, 3, 1 + movnez \dst, \tmp1, \tmp2 + + srli \src1, \src1, 4 + slli \src2, \src2, 4 + bnez \src1, 1b + .endm +#if __XTENSA_CALL0_ABI__ + mul_mulsi3_body a12, a13, a14, a15, a8 +#else + /* The result will be written into a2, so save that argument in a4. 
*/ + mov a4, a2 + mul_mulsi3_body a2, a4, a3, a5, a6 +#endif + leaf_return +#endif /* XCHAL_NO_MUL */ + + .size __umulsidi3, . - __umulsidi3 + +#endif /* L_umulsidi3 */ + + +/* Define a macro for the NSAU (unsigned normalize shift amount) + instruction, which computes the number of leading zero bits, + to handle cases where it is not included in the Xtensa processor + configuration. */ + /* Operands: cnt receives the result, val is the input; tmp and a are scratch registers that are written only in the fallback path. */ + .macro do_nsau cnt, val, tmp, a +#if XCHAL_HAVE_NSA + nsau \cnt, \val +#else + mov \a, \val /* work on a copy so that cnt may be the same register as val */ + movi \cnt, 0 + extui \tmp, \a, 16, 16 + bnez \tmp, 0f /* upper 16 bits nonzero: leave cnt at 0 */ + movi \cnt, 16 + slli \a, \a, 16 /* upper half was zero: count 16 and move the lower half up */ +0: + extui \tmp, \a, 24, 8 + bnez \tmp, 1f /* top byte nonzero: go straight to the table lookup */ + addi \cnt, \cnt, 8 + slli \a, \a, 8 /* top byte was zero: count 8 more and move the next byte up */ +1: + movi \tmp, __nsau_data + extui \a, \a, 24, 8 + add \tmp, \tmp, \a + l8ui \tmp, \tmp, 0 /* tmp = __nsau_data[top byte of a] */ + add \cnt, \cnt, \tmp +#endif /* !XCHAL_HAVE_NSA */ + .endm + /* __nsau_data: 256-entry byte table indexed by an 8-bit value, giving the number of leading zero bits in that value (8 for 0, 0 for 128..255). The entries are emitted only when XCHAL_HAVE_NSA is not set. */ +#ifdef L_clz + .section .rodata + .align 4 + .global __nsau_data + .type __nsau_data, @object +__nsau_data: +#if !XCHAL_HAVE_NSA + .byte 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4 + .byte 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 + .byte 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + .byte 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +#endif /* !XCHAL_HAVE_NSA */ + .size __nsau_data, .
- __nsau_data + .hidden __nsau_data +#endif /* L_clz */ + + /* __clzsi2: replace a2 with the number of leading zero bits in a2, computed by do_nsau with a3 and a4 as scratch. */ +#ifdef L_clzsi2 + .align 4 + .global __clzsi2 + .type __clzsi2, @function +__clzsi2: + leaf_entry sp, 16 + do_nsau a2, a2, a3, a4 + leaf_return + .size __clzsi2, . - __clzsi2 + +#endif /* L_clzsi2 */ + + /* __ctzsi2: replace a2 with the bit index of its lowest set bit, computed as 31 - nsau (a2 & -a2); a3, a4 and a5 are used as temporaries. */ +#ifdef L_ctzsi2 + .align 4 + .global __ctzsi2 + .type __ctzsi2, @function +__ctzsi2: + leaf_entry sp, 16 + neg a3, a2 + and a3, a3, a2 /* a3 = a2 & -a2, i.e. only the lowest set bit of a2 */ + do_nsau a2, a3, a4, a5 + neg a2, a2 + addi a2, a2, 31 /* a2 = 31 - nsau (a3) */ + leaf_return + .size __ctzsi2, . - __ctzsi2 + +#endif /* L_ctzsi2 */ + + /* __ffssi2: replace a2 with 32 - nsau (a2 & -a2), i.e. the 1-based position of the lowest set bit of a2; a3, a4 and a5 are used as temporaries. */ +#ifdef L_ffssi2 + .align 4 + .global __ffssi2 + .type __ffssi2, @function +__ffssi2: + leaf_entry sp, 16 + neg a3, a2 + and a3, a3, a2 /* a3 = a2 & -a2, i.e. only the lowest set bit of a2 */ + do_nsau a2, a3, a4, a5 + neg a2, a2 + addi a2, a2, 32 /* a2 = 32 - nsau (a3) */ + leaf_return + .size __ffssi2, . - __ffssi2 + +#endif /* L_ffssi2 */ + + /* __udivsi3: unsigned 32-bit division, a2 = a2 / a3. Uses the quou instruction when XCHAL_HAVE_DIV32 is set; otherwise a shift-and-subtract loop with a4 through a7 as temporaries, where a zero divisor ends in the ill instruction at .Lerror. */ +#ifdef L_udivsi3 + .align 4 + .global __udivsi3 + .type __udivsi3, @function +__udivsi3: + leaf_entry sp, 16 +#if XCHAL_HAVE_DIV32 + quou a2, a2, a3 +#else + bltui a3, 2, .Lle_one /* check if the divisor <= 1 */ + + mov a6, a2 /* keep dividend in a6 */ + do_nsau a5, a6, a2, a7 /* dividend_shift = nsau (dividend) */ + do_nsau a4, a3, a2, a7 /* divisor_shift = nsau (divisor) */ + bgeu a5, a4, .Lspecial /* dividend has at least as many leading zeros as the divisor: quotient is 0 or 1 */ + + sub a4, a4, a5 /* count = divisor_shift - dividend_shift */ + ssl a4 + sll a3, a3 /* divisor <<= count */ + movi a2, 0 /* quotient = 0 */ + + /* test-subtract-and-shift loop; one quotient bit on each iteration */ +#if XCHAL_HAVE_LOOPS + loopnez a4, .Lloopend +#endif /* XCHAL_HAVE_LOOPS */ +.Lloop: + bltu a6, a3, .Lzerobit + sub a6, a6, a3 + addi a2, a2, 1 +.Lzerobit: + slli a2, a2, 1 + srli a3, a3, 1 +#if !XCHAL_HAVE_LOOPS + addi a4, a4, -1 + bnez a4, .Lloop +#endif /* !XCHAL_HAVE_LOOPS */ +.Lloopend: + + bltu a6, a3, .Lreturn + addi a2, a2, 1 /* increment quotient if dividend >= divisor */ +.Lreturn: + leaf_return + +.Lle_one: + beqz a3, .Lerror /* if divisor == 1, return the dividend */ + leaf_return /* a2 has not been modified yet, so it still holds the dividend */ + +.Lspecial: + /* return dividend >= divisor */ + bltu a6, a3, .Lreturn0 + movi
a2, 1 + leaf_return + +.Lerror: + /* Divide by zero: Use an illegal instruction to force an exception. + The subsequent "DIV0" string can be recognized by the exception + handler to identify the real cause of the exception. */ + ill + .ascii "DIV0" + +.Lreturn0: + movi a2, 0 +#endif /* XCHAL_HAVE_DIV32 */ + leaf_return + .size __udivsi3, . - __udivsi3 + +#endif /* L_udivsi3 */ + + /* __divsi3: signed 32-bit division, a2 = a2 / a3. Uses the quos instruction when XCHAL_HAVE_DIV32 is set; otherwise divides the absolute values with a shift-and-subtract loop and negates the quotient when a7 = dividend ^ divisor is negative. */ +#ifdef L_divsi3 + .align 4 + .global __divsi3 + .type __divsi3, @function +__divsi3: + leaf_entry sp, 16 +#if XCHAL_HAVE_DIV32 + quos a2, a2, a3 +#else + xor a7, a2, a3 /* sign = dividend ^ divisor */ + do_abs a6, a2, a4 /* udividend = abs (dividend) */ + do_abs a3, a3, a4 /* udivisor = abs (divisor) */ + bltui a3, 2, .Lle_one /* check if udivisor <= 1 */ + do_nsau a5, a6, a2, a8 /* udividend_shift = nsau (udividend) */ + do_nsau a4, a3, a2, a8 /* udivisor_shift = nsau (udivisor) */ + bgeu a5, a4, .Lspecial /* magnitude of the quotient is 0 or 1 */ + + sub a4, a4, a5 /* count = udivisor_shift - udividend_shift */ + ssl a4 + sll a3, a3 /* udivisor <<= count */ + movi a2, 0 /* quotient = 0 */ + + /* test-subtract-and-shift loop; one quotient bit on each iteration */ +#if XCHAL_HAVE_LOOPS + loopnez a4, .Lloopend +#endif /* XCHAL_HAVE_LOOPS */ +.Lloop: + bltu a6, a3, .Lzerobit + sub a6, a6, a3 + addi a2, a2, 1 +.Lzerobit: + slli a2, a2, 1 + srli a3, a3, 1 +#if !XCHAL_HAVE_LOOPS + addi a4, a4, -1 + bnez a4, .Lloop +#endif /* !XCHAL_HAVE_LOOPS */ +.Lloopend: + + bltu a6, a3, .Lreturn + addi a2, a2, 1 /* increment if udividend >= udivisor */ +.Lreturn: + neg a5, a2 /* a5 = -quotient, selected below when sign < 0 */ + movltz a2, a5, a7 /* return (sign < 0) ? -quotient : quotient */ + leaf_return + +.Lle_one: + beqz a3, .Lerror /* udivisor == 0: divide by zero */ + neg a2, a6 /* if udivisor == 1, then return... */ + movgez a2, a6, a7 /* (sign < 0) ? -udividend : udividend */ + leaf_return + +.Lspecial: + bltu a6, a3, .Lreturn0 /* if dividend < divisor, return 0 */ + movi a2, 1 + movi a4, -1 + movltz a2, a4, a7 /* else return (sign < 0) ?
-1 : 1 */ + leaf_return + +.Lerror: + /* Divide by zero: Use an illegal instruction to force an exception. + The subsequent "DIV0" string can be recognized by the exception + handler to identify the real cause of the exception. */ + ill + .ascii "DIV0" + +.Lreturn0: + movi a2, 0 +#endif /* XCHAL_HAVE_DIV32 */ + leaf_return + .size __divsi3, . - __divsi3 + +#endif /* L_divsi3 */ + + /* __umodsi3: unsigned 32-bit remainder, a2 = a2 % a3. Uses the remu instruction when XCHAL_HAVE_DIV32 is set; otherwise repeatedly subtracts the shifted divisor from a2 with a4 through a7 as temporaries. In that fallback a divisor of 1 returns 0 and a divisor of 0 ends in the ill instruction. */ +#ifdef L_umodsi3 + .align 4 + .global __umodsi3 + .type __umodsi3, @function +__umodsi3: + leaf_entry sp, 16 +#if XCHAL_HAVE_DIV32 + remu a2, a2, a3 +#else + bltui a3, 2, .Lle_one /* check if the divisor is <= 1 */ + + do_nsau a5, a2, a6, a7 /* dividend_shift = nsau (dividend) */ + do_nsau a4, a3, a6, a7 /* divisor_shift = nsau (divisor) */ + bgeu a5, a4, .Lspecial /* no shifting needed: at most one subtraction remains */ + + sub a4, a4, a5 /* count = divisor_shift - dividend_shift */ + ssl a4 + sll a3, a3 /* divisor <<= count */ + + /* test-subtract-and-shift loop */ +#if XCHAL_HAVE_LOOPS + loopnez a4, .Lloopend +#endif /* XCHAL_HAVE_LOOPS */ +.Lloop: + bltu a2, a3, .Lzerobit + sub a2, a2, a3 +.Lzerobit: + srli a3, a3, 1 +#if !XCHAL_HAVE_LOOPS + addi a4, a4, -1 + bnez a4, .Lloop +#endif /* !XCHAL_HAVE_LOOPS */ +.Lloopend: + +.Lspecial: + bltu a2, a3, .Lreturn + sub a2, a2, a3 /* subtract once more if dividend >= divisor */ +.Lreturn: + leaf_return + +.Lle_one: + bnez a3, .Lreturn0 /* divisor == 1: the remainder is 0 */ + + /* Divide by zero: Use an illegal instruction to force an exception. + The subsequent "DIV0" string can be recognized by the exception + handler to identify the real cause of the exception. */ + ill + .ascii "DIV0" + +.Lreturn0: + movi a2, 0 +#endif /* XCHAL_HAVE_DIV32 */ + leaf_return + .size __umodsi3, .
- __umodsi3 + +#endif /* L_umodsi3 */ + + /* __modsi3: signed 32-bit remainder, a2 = a2 % a3. Uses the rems instruction when XCHAL_HAVE_DIV32 is set; otherwise reduces abs (dividend) by abs (divisor) and negates the result when the original dividend, saved in a7, is negative. */ +#ifdef L_modsi3 + .align 4 + .global __modsi3 + .type __modsi3, @function +__modsi3: + leaf_entry sp, 16 +#if XCHAL_HAVE_DIV32 + rems a2, a2, a3 +#else + mov a7, a2 /* save original (signed) dividend */ + do_abs a2, a2, a4 /* udividend = abs (dividend) */ + do_abs a3, a3, a4 /* udivisor = abs (divisor) */ + bltui a3, 2, .Lle_one /* check if udivisor <= 1 */ + do_nsau a5, a2, a6, a8 /* udividend_shift = nsau (udividend) */ + do_nsau a4, a3, a6, a8 /* udivisor_shift = nsau (udivisor) */ + bgeu a5, a4, .Lspecial /* no shifting needed: at most one subtraction remains */ + + sub a4, a4, a5 /* count = udivisor_shift - udividend_shift */ + ssl a4 + sll a3, a3 /* udivisor <<= count */ + + /* test-subtract-and-shift loop */ +#if XCHAL_HAVE_LOOPS + loopnez a4, .Lloopend +#endif /* XCHAL_HAVE_LOOPS */ +.Lloop: + bltu a2, a3, .Lzerobit + sub a2, a2, a3 +.Lzerobit: + srli a3, a3, 1 +#if !XCHAL_HAVE_LOOPS + addi a4, a4, -1 + bnez a4, .Lloop +#endif /* !XCHAL_HAVE_LOOPS */ +.Lloopend: + +.Lspecial: + bltu a2, a3, .Lreturn + sub a2, a2, a3 /* subtract again if udividend >= udivisor */ +.Lreturn: + bgez a7, .Lpositive + neg a2, a2 /* if (dividend < 0), return -udividend */ +.Lpositive: + leaf_return + +.Lle_one: + bnez a3, .Lreturn0 /* udivisor == 1: the remainder is 0 */ + + /* Divide by zero: Use an illegal instruction to force an exception. + The subsequent "DIV0" string can be recognized by the exception + handler to identify the real cause of the exception. */ + ill + .ascii "DIV0" + +.Lreturn0: + movi a2, 0 +#endif /* XCHAL_HAVE_DIV32 */ + leaf_return + .size __modsi3, . - __modsi3 + +#endif /* L_modsi3 */ + + /* uh and ul name the registers used by the 64-bit shift routines below for the upper and lower 32-bit halves of the value held in a2/a3; the assignment depends on __XTENSA_EB__. */ +#ifdef __XTENSA_EB__ +#define uh a2 +#define ul a3 +#else +#define uh a3 +#define ul a2 +#endif /* __XTENSA_EB__ */ + + /* __ashldi3: shift the 64-bit value uh:ul left by the count in a4, leaving the result in uh:ul. */ +#ifdef L_ashldi3 + .align 4 + .global __ashldi3 + .type __ashldi3, @function +__ashldi3: + leaf_entry sp, 16 + ssl a4 + bgei a4, 32, .Llow_only /* count >= 32: only the low word contributes to the result */ + src uh, uh, ul + sll ul, ul + leaf_return + +.Llow_only: + sll uh, ul + movi ul, 0 /* the low word of the result is zero */ + leaf_return + .size __ashldi3, .
- __ashldi3 + +#endif /* L_ashldi3 */ + + /* __ashrdi3: arithmetic right shift of the 64-bit value uh:ul by the count in a4, leaving the result in uh:ul. */ +#ifdef L_ashrdi3 + .align 4 + .global __ashrdi3 + .type __ashrdi3, @function +__ashrdi3: + leaf_entry sp, 16 + ssr a4 + bgei a4, 32, .Lhigh_only /* count >= 32: only the high word contributes to the result */ + src ul, uh, ul + sra uh, uh + leaf_return + +.Lhigh_only: + sra ul, uh + srai uh, uh, 31 /* fill the high word with copies of the sign bit */ + leaf_return + .size __ashrdi3, . - __ashrdi3 + +#endif /* L_ashrdi3 */ + + /* __lshrdi3: logical right shift of the 64-bit value uh:ul by the count in a4, leaving the result in uh:ul. */ +#ifdef L_lshrdi3 + .align 4 + .global __lshrdi3 + .type __lshrdi3, @function +__lshrdi3: + leaf_entry sp, 16 + ssr a4 + bgei a4, 32, .Lhigh_only1 /* count >= 32: only the high word contributes to the result */ + src ul, uh, ul + srl uh, uh + leaf_return + +.Lhigh_only1: + srl ul, uh + movi uh, 0 /* the high word of the result is zero */ + leaf_return + .size __lshrdi3, . - __lshrdi3 + +#endif /* L_lshrdi3 */ + + +#include "ieee754-df.S" +#include "ieee754-sf.S" diff --git a/libgcc/config/xtensa/t-xtensa b/libgcc/config/xtensa/t-xtensa index 7d9e9db0487..5bcc0946243 100644 --- a/libgcc/config/xtensa/t-xtensa +++ b/libgcc/config/xtensa/t-xtensa @@ -1,2 +1,14 @@ +LIB1ASMSRC = xtensa/lib1funcs.S +LIB1ASMFUNCS = _mulsi3 _divsi3 _modsi3 _udivsi3 _umodsi3 \ + _umulsidi3 _clz _clzsi2 _ctzsi2 _ffssi2 \ + _ashldi3 _ashrdi3 _lshrdi3 \ + _negsf2 _addsubsf3 _mulsf3 _divsf3 _cmpsf2 _fixsfsi _fixsfdi \ + _fixunssfsi _fixunssfdi _floatsisf _floatunsisf \ + _floatdisf _floatundisf \ + _negdf2 _addsubdf3 _muldf3 _divdf3 _cmpdf2 _fixdfsi _fixdfdi \ + _fixunsdfsi _fixunsdfdi _floatsidf _floatunsidf \ + _floatdidf _floatundidf \ + _truncdfsf2 _extendsfdf2 + LIB2ADDEH = $(srcdir)/config/xtensa/unwind-dw2-xtensa.c \ $(srcdir)/unwind-dw2-fde.c $(srcdir)/unwind-sjlj.c $(srcdir)/unwind-c.c |