Move libgcc1 to toplevel libgcc

gcc: * Makefile.in (LIB1ASMSRC): Don't export. (libgcc.mvars): Don't emit LIB1ASMFUNCS, LIB1ASMSRC. * config/arm/arm.c: Update lib1funcs.asm filename. * config/arm/linux-eabi.h: Likewise. * config/arm/bpabi-v6m.S, config/arm/bpabi.S, config/arm/ieee754-df.S, config/arm/ieee754-sf.S: Move to ../libgcc/config/arm. * config/arm/lib1funcs.asm: Move to ../libgcc/config/arm/lib1funcs.S. * config/arm/t-arm (LIB1ASMSRC, LIB1ASMFUNCS): Remove. * config/arm/t-arm-elf (LIB1ASMFUNCS): Remove. * config/arm/t-bpabi: Likewise. * config/arm/t-linux (LIB1ASMSRC, LIB1ASMFUNCS): Remove. * config/arm/t-linux-eabi (LIB1ASMFUNCS): Remove. * config/arm/t-strongarm-elf: Likewise. * config/arm/t-symbian: Likewise. * config/arm/t-vxworks: Likewise. * config/arm/t-wince-pe: Likewise. * config/avr/libgcc.S: Move to ../libgcc/config/avr. * config/avr/t-avr (LIB1ASMSRC, LIB1ASMFUNCS): Remove. * config/bfin/lib1funcs.asm: Move to ../libgcc/config/bfin/lib1funcs.S. * config/bfin/t-bfin: Remove. * config/bfin/t-bfin-elf (LIB1ASMSRC, LIB1ASMFUNCS): Remove. * config/bfin/t-bfin-linux: Likewise. * config/bfin/t-bfin-uclinux: Likewise. * config/c6x/lib1funcs.asm: Move to ../libgcc/config/c6x/lib1funcs.S. * config/c6x/t-c6x-elf (LIB1ASMSRC, LIB1ASMFUNCS): Remove. * config/fr30/lib1funcs.asm: Move to ../libgcc/config/fr30/lib1funcs.S. * config/fr30/t-fr30 (LIB1ASMSRC, LIB1ASMFUNCS): Remove. * config/frv/lib1funcs.asm: Move to ../libgcc/config/frv/lib1funcs.S. * config/frv/t-frv (CROSS_LIBGCC1, LIB1ASMSRC, LIB1ASMFUNCS): Remove. * config/h8300/fixunssfsi.c: Update lib1funcs.asm filename. * config/h8300/lib1funcs.asm: Move to ../libgcc/config/h8300/lib1funcs.S. * config/h8300/t-h8300 (LIB1ASMSRC, LIB1ASMFUNCS): Remove. * config/i386/cygwin.asm: Move to ../libgcc/config/i386/cygwin.S. * config/i386/t-cygming (LIB1ASMSRC, LIB1ASMFUNCS): Remove. * config/i386/t-interix: Likewise. * config/ia64/lib1funcs.asm: Move to ../libgcc/config/ia64/lib1funcs.S. * config/ia64/t-hpux (LIB1ASMFUNCS, LIBGCC1_TEST): Remove. * config/ia64/t-ia64 (LIB1ASMSRC, LIB1ASMFUNCS): Remove. * config/iq2000/t-iq2000 (LIBGCC1, CROSS_LIBGCC1): Remove. * config/m32c/m32c.c: Update m32c-lib1.S filename. * config/m32c/m32c-lib1.S: Move to ../libgcc/config/m32c/lib1funcs.S. * config/m32c/t-m32c (LIB1ASMSRC, LIB1ASMFUNCS): Remove. * config/m32r/t-linux (CROSS_LIBGCC1, LIBGCC1, LIBGCC1_TEST): Remove. * config/m68k/lb1sf68.asm: Move to ../libgcc/config/m68k/lb1sf68.S. * config/m68k/t-floatlib (LIB1ASMSRC, LIB1ASMFUNCS): New file. * config/mcore/lib1.asm: Move to ../libgcc/config/mcore/lib1funcs.S. * config/mcore/t-mcore (LIB1ASMSRC, LIB1ASMFUNCS): Remove. * config/mep/mep-lib1.asm: Move to ../libgcc/config/mep/lib1funcs.S. * config/mep/t-mep (LIB1ASMSRC, LIB1ASMFUNCS): Remove. * config/mips/mips16.S: Move to ../libgcc/config/mips. * config/mips/t-libgcc-mips16: Remove. * config/mips/t-sr71k (LIBGCC1, CROSS_LIBGCC1): Remove. * config/pa/milli64.S: Move to ../libgcc/config/pa. * config/pa/t-linux (LIB1ASMFUNCS, LIB1ASMSRC): Remove. * config/pa/t-linux64: Likewise. * config/picochip/libgccExtras/fake_libgcc.asm: Move to ../libgcc/config/picochip/lib1funcs.S. * config/picochip/t-picochip (LIB1ASMFUNCS, LIB1ASMSRC): Remove. * config/sh/lib1funcs.asm: Move to ../libgcc/config/sh/lib1funcs.S. * config/sh/lib1funcs.h: Move to ../libgcc/config/sh. * config/sh/sh.h: Update lib1funcs.asm filename. * config/sh/t-linux (LIB1ASMFUNCS_CACHE): Remove. * config/sh/t-netbsd: Likewise. * config/sh/t-sh (LIB1ASMSRC, LIB1ASMFUNCS, LIB1ASMFUNCS_CACHE): Remove. * config/sh/t-sh64 (LIB1ASMFUNCS): Remove. * config/sparc/lb1spc.asm: Move to ../libgcc/config/sparc/lb1spc.S. * config/sparc/lb1spl.asm: Remove. * config/sparc/t-elf (LIB1ASMSRC, LIB1ASMFUNCS): Remove. * config/sparc/t-leon: Likewise. * config/spu/t-spu-elf (LIBGCC1, CROSS_LIBGCC1): Remove. * config/v850/lib1funcs.asm: Move to ../libgcc/config/v850/lib1funcs.S. * config/v850/t-v850 (LIB1ASMSRC, LIB1ASMFUNCS): Remove * config/vax/lib1funcs.asm: Move to ../libgcc/config/vax/lib1funcs.S. * config/vax/t-linux: Remove. * config/xtensa/ieee754-df.S, config/xtensa/ieee754-sf.S: Move to ../libgcc/config/xtensa. * config/xtensa/lib1funcs.asm: Move to ../libgcc/config/xtensa/lib1funcs.S. * config/xtensa/t-xtensa (LIB1ASMSRC, LIB1ASMFUNCS): Remove. * config.gcc (bfin*-rtems*): Remove bfin/t-bfin from tmake_file. (bfin*-*): Likewise. (mips64*-*-linux*, mipsisa64*-*-linux*): Remove mips/t-libgcc-mips16 from tmake_file. (mips*-*-linux*): Likewise. (mips*-sde-elf*): Likewise. (mipsisa32-*-elf*, mipsisa32el-*-elf*, mipsisa32r2-*-elf*) (mipsisa32r2el-*-elf*, mipsisa64-*-elf*, mipsisa64el-*-elf*) (mipsisa64r2-*-elf*, mipsisa64r2el-*-elf*): Likewise. (mipsisa64sb1-*-elf*, mipsisa64sb1el-*-elf*): Likewise. (mips-*-elf*, mipsel-*-elf*): Likewise. (mips64-*-elf*, mips64el-*-elf*): Likewise. (mips64orion-*-elf*, mips64orionel-*-elf*): Likewise. (mips*-*-rtems*): Likewise. (mipstx39-*-elf*, mipstx39el-*-elf*): Likewise. (vax-*-linux*): Remove vax/t-linux from tmake_file. libgcc: * Makefile.in ($(lib1asmfuncs-o), $(lib1asmfuncs-s-o)): Use $(srcdir) to refer to $(LIB1ASMSRC). Use $<. * config/arm/bpabi-v6m.S, config/arm/bpabi.S, config/arm/ieee754-df.S, config/arm/ieee754-sf.S, config/arm/lib1funcs.S: New files. * config/arm/libunwind.S [!__symbian__]: Use lib1funcs.S. * config/arm/t-arm: New file. * config/arm/t-bpabi (LIB1ASMFUNCS): Set. * config/arm/t-elf, config/arm/t-linux, config/arm/t-linux-eabi, config/arm/t-strongarm-elf: New files. * config/arm/t-symbian (LIB1ASMFUNCS): Set. * config/arm/t-vxworks, config/arm/t-wince-pe: New files. * config/avr/lib1funcs.S: New file. * config/avr/t-avr (LIB1ASMSRC, LIB1ASMFUNCS): Set. * config/bfin/lib1funcs.S, config/bfin/t-bfin: New files. * config/c6x/lib1funcs.S: New file. * config/c6x/t-elf (LIB1ASMSRC, LIB1ASMFUNCS): Set. * config/fr30/lib1funcs.S, config/fr30/t-fr30: New files. * config/frv/lib1funcs.S: New file. * config/frv/t-frv (LIB1ASMSRC, LIB1ASMFUNCS): Set. * config/h8300/lib1funcs.S, config/h8300/t-h8300: New files. * config/i386/cygwin.S, config/i386/t-chkstk: New files. * config/ia64/__divxf3.asm: Rename to ... * config/ia64/__divxf3.S: ... this. Adapt lib1funcs.asm filename. * config/ia64/_fixtfdi.asm: Rename to ... * config/ia64/_fixtfdi.S: ... this. Adapt lib1funcs.asm filename. * config/ia64/_fixunstfdi.asm: Rename to ... * config/ia64/_fixunstfdi.S: ... this. Adapt lib1funcs.asm filename. * config/ia64/_floatditf.asm: Rename to ... * config/ia64/_floatditf.S: ... this. Adapt lib1funcs.asm filename. * config/ia64/lib1funcs.S: New file. * config/ia64/t-hpux (LIB1ASMFUNCS): Set. * config/ia64/t-ia64 (LIB1ASMSRC, LIB1ASMFUNCS): Set. * config/ia64/t-softfp-compat (libgcc1-tf-compats): Adapt suffix. * config/m32c/lib1funcs.S, config/m32c/t-m32c: New files. * config/m68k/lb1sf68.S, config/m68k/t-floatlib: New files. * config/mcore/lib1funcs.S, config/mcore/t-mcore: New files. * config/mep/lib1funcs.S: New file. * config/mep/t-mep (LIB1ASMSRC, LIB1ASMFUNCS): Set. * config/mips/mips16.S: New file. * config/mips/t-mips16 (LIB1ASMSRC, LIB1ASMFUNCS): Set. * config/pa/milli64.S: New file. * config/pa/t-linux, config/pa/t-linux64: New files. * config/picochip/lib1funcs.S: New file. * config/picochip/t-picochip (LIB1ASMSRC, LIB1ASMFUNCS): Set. * config/sh/lib1funcs.S, config/sh/lib1funcs.h: New files. * config/sh/t-linux (LIB1ASMFUNCS_CACHE): Set. * config/sh/t-netbsd: New file. * config/sh/t-sh (LIB1ASMSRC, LIB1ASMFUNCS, LIB1ASMFUNCS_CACHE): Set. Use $(srcdir) to refer to lib1funcs.S, adapt filename. * config/sh/t-sh64: New file. * config/sparc/lb1spc.S: New file. * config/sparc/t-softmul (LIB1ASMSRC): Adapt sparc/lb1spc.asm filename. * config/v850/lib1funcs.S, config/v850/t-v850: New files. * config/vax/lib1funcs.S, config/vax/t-linux: New files. * config/xtensa/ieee754-df.S, config/xtensa/ieee754-sf.S, config/xtensa/lib1funcs.S: New files. * config/xtensa/t-xtensa (LIB1ASMSRC, LIB1ASMFUNCS): Set. * config.host (arm-wrs-vxworks): Add arm/t-arm, arm/t-vxworks to tmake_file. (arm*-*-freebsd*): Add arm/t-arm, arm/t-strongarm-elf to tmake_file. (arm*-*-netbsdelf*): Add arm/t-arm to tmake_file. (arm*-*-linux*): Likewise. Add arm/t-elf, arm/t-bpabi, arm/t-linux-eabi to tmake_file for arm*-*-linux-*eabi, add arm/t-linux otherwise. (arm*-*-uclinux*): Add arm/t-arm, arm/t-elf to tmake_file. (arm*-*-ecos-elf): Likewise. (arm*-*-eabi*, arm*-*-symbianelf*): Likewise. (arm*-*-rtems*): Likewise. (arm*-*-elf): Likewise. (arm*-wince-pe*): Add arm/t-arm, arm/t-wince-pe to tmake_file. (avr-*-rtems*): Add to tmake_file, add avr/t-avr. (bfin*-elf*): Add bfin/t-bfin to tmake_file. (bfin*-uclinux*): Likewise. (bfin*-linux-uclibc*): Likewise. (bfin*-rtems*): Likewise. (bfin*-*): Likewise. (fido-*-elf): Merge into m68k-*-elf*. (fr30-*-elf)): Add fr30/t-fr30 to tmake_file. (frv-*-*linux*): Add frv/t-frv to tmake_file. (h8300-*-rtems*): Add h8300/t-h8300 to tmake_file. (h8300-*-elf*): Likewise. (hppa*64*-*-linux*): Add pa/t-linux, pa/t-linux64 to tmake_file. (hppa*-*-linux*): Add pa/t-linux to tmake_file. (i[34567]86-*-cygwin*): Add i386/t-chkstk to tmake_file. (i[34567]86-*-mingw*): Likewise. (x86_64-*-mingw*): Likewise. (i[34567]86-*-interix3*): Likewise. (ia64*-*-hpux*): Add ia64/t-ia64, ia64/t-hpux to tmake_file. (ia64-hp-*vms*): Add ia64/t-ia64 to tmake_file. (m68k-*-elf*): Also handle fido-*-elf. Add m68k/t-floatlib to tmake_file. (m68k-*-uclinux*): Add m68k/t-floatlib to tmake_file. (m68k-*-linux*): Likewise. (m68k-*-rtems*): Likewise. (mcore-*-elf): Add mcore/t-mcore to tmake_file. (sh-*-elf*, sh[12346l]*-*-elf*): Add sh/t-sh64 to tmake_file for sh64*-*-*. (sh-*-linux*, sh[2346lbe]*-*-linux*): Add sh/t-sh to tmake_file. Add sh/t-sh64 to tmake_file for sh64*-*-linux*. (sh-*-netbsdelf*, shl*-*-netbsdelf*, sh5-*-netbsd*) (sh5l*-*-netbsd*, sh64-*-netbsd*, sh64l*-*-netbsd*): Add sh/t-sh, sh/t-netbsd to tmake_file. Add sh/t-sh64 to tmake_file for sh5*-*-netbsd*, sh64*-netbsd*. (sh-*-rtems*): Add sh/t-sh to tmake_file. (sh-wrs-vxworks): Likewise. (sparc-*-linux*): Add sparc/t-softmul to tmake_file except for *-leon[3-9]*. (v850*-*-*): Add v850/t-v850 to tmake_file. (vax-*-linux*): Add vax/t-linux to tmake_file. (m32c-*-elf*, m32c-*-rtems*): Add m32c/t-m32c to tmake_file. git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@180773 138bc75d-0d04-0410-961f-82ee72b054a4
author: ro <ro@138bc75d-0d04-0410-961f-82ee72b054a4> 2011-11-02 15:03:19 +0000
committer: ro <ro@138bc75d-0d04-0410-961f-82ee72b054a4> 2011-11-02 15:03:19 +0000
commit: 9213d2eb44a8b9bcc432b57e246d9b52d5bdc949 (patch)
tree: bfbde9a54f663fb7556b9dacd07709ef97c1961c /libgcc
parent: 237490bf10db39b859bd28598ff64f1bd2c84421 (diff)
download: ppe42-gcc-9213d2eb44a8b9bcc432b57e246d9b52d5bdc949.tar.gz
ppe42-gcc-9213d2eb44a8b9bcc432b57e246d9b52d5bdc949.zip
72 files changed, 29534 insertions, 55 deletions
diff --git a/libgcc/ChangeLog b/libgcc/ChangeLog
index b5d9c243a98..6b2514aba9a 100644
--- a/libgcc/ChangeLog
+++ b/libgcc/ChangeLog
@@ -1,5 +1,125 @@
 2011-11-02  Rainer Orth  <ro@CeBiTec.Uni-Bielefeld.DE>
 
+	* Makefile.in ($(lib1asmfuncs-o), $(lib1asmfuncs-s-o)): Use
+	$(srcdir) to refer to $(LIB1ASMSRC).
+	Use $<.
+	* config/arm/bpabi-v6m.S, config/arm/bpabi.S,
+	config/arm/ieee754-df.S, config/arm/ieee754-sf.S,
+	config/arm/lib1funcs.S: New files.
+	* config/arm/libunwind.S [!__symbian__]: Use lib1funcs.S.
+	* config/arm/t-arm: New file.
+	* config/arm/t-bpabi (LIB1ASMFUNCS): Set.
+	* config/arm/t-elf, config/arm/t-linux, config/arm/t-linux-eabi,
+	config/arm/t-strongarm-elf: New files.
+	* config/arm/t-symbian (LIB1ASMFUNCS): Set.
+	* config/arm/t-vxworks, config/arm/t-wince-pe: New files.
+	* config/avr/lib1funcs.S: New file.
+	* config/avr/t-avr (LIB1ASMSRC, LIB1ASMFUNCS): Set.
+	* config/bfin/lib1funcs.S, config/bfin/t-bfin: New files.
+	* config/c6x/lib1funcs.S: New file.
+	* config/c6x/t-elf (LIB1ASMSRC, LIB1ASMFUNCS): Set.
+	* config/fr30/lib1funcs.S, config/fr30/t-fr30: New files.
+	* config/frv/lib1funcs.S: New file.
+	* config/frv/t-frv (LIB1ASMSRC, LIB1ASMFUNCS): Set.
+	* config/h8300/lib1funcs.S, config/h8300/t-h8300: New files.
+	* config/i386/cygwin.S, config/i386/t-chkstk: New files.
+	* config/ia64/__divxf3.asm: Rename to ...
+	* config/ia64/__divxf3.S: ... this.
+	Adapt lib1funcs.asm filename.
+	* config/ia64/_fixtfdi.asm: Rename to ...
+	* config/ia64/_fixtfdi.S: ... this.
+	Adapt lib1funcs.asm filename.
+	* config/ia64/_fixunstfdi.asm: Rename to ...
+	* config/ia64/_fixunstfdi.S: ... this.
+	Adapt lib1funcs.asm filename.
+	* config/ia64/_floatditf.asm: Rename to ...
+	* config/ia64/_floatditf.S: ... this.
+	Adapt lib1funcs.asm filename.
+	* config/ia64/lib1funcs.S: New file.
+	* config/ia64/t-hpux (LIB1ASMFUNCS): Set.
+	* config/ia64/t-ia64 (LIB1ASMSRC, LIB1ASMFUNCS): Set.
+	* config/ia64/t-softfp-compat (libgcc1-tf-compats): Adapt suffix.
+	* config/m32c/lib1funcs.S, config/m32c/t-m32c: New files.
+	* config/m68k/lb1sf68.S, config/m68k/t-floatlib: New files.
+	* config/mcore/lib1funcs.S, config/mcore/t-mcore: New files.
+	* config/mep/lib1funcs.S: New file.
+	* config/mep/t-mep (LIB1ASMSRC, LIB1ASMFUNCS): Set.
+	* config/mips/mips16.S: New file.
+	* config/mips/t-mips16 (LIB1ASMSRC, LIB1ASMFUNCS): Set.
+	* config/pa/milli64.S: New file.
+	* config/pa/t-linux, config/pa/t-linux64: New files.
+	* config/picochip/lib1funcs.S: New file.
+	* config/picochip/t-picochip (LIB1ASMSRC, LIB1ASMFUNCS): Set.
+	* config/sh/lib1funcs.S, config/sh/lib1funcs.h: New files.
+	* config/sh/t-linux (LIB1ASMFUNCS_CACHE): Set.
+	* config/sh/t-netbsd: New file.
+	* config/sh/t-sh (LIB1ASMSRC, LIB1ASMFUNCS, LIB1ASMFUNCS_CACHE): Set.
+	Use $(srcdir) to refer to lib1funcs.S, adapt filename.
+	* config/sh/t-sh64: New file.
+	* config/sparc/lb1spc.S: New file.
+	* config/sparc/t-softmul (LIB1ASMSRC): Adapt sparc/lb1spc.asm
+	filename.
+	* config/v850/lib1funcs.S, config/v850/t-v850: New files.
+	* config/vax/lib1funcs.S, config/vax/t-linux: New files.
+	* config/xtensa/ieee754-df.S, config/xtensa/ieee754-sf.S,
+	config/xtensa/lib1funcs.S: New files.
+	* config/xtensa/t-xtensa (LIB1ASMSRC, LIB1ASMFUNCS): Set.
+	* config.host (arm-wrs-vxworks): Add arm/t-arm, arm/t-vxworks to
+	tmake_file.
+	(arm*-*-freebsd*): Add arm/t-arm, arm/t-strongarm-elf to tmake_file.
+	(arm*-*-netbsdelf*): Add arm/t-arm to tmake_file.
+	(arm*-*-linux*): Likewise.
+	Add arm/t-elf, arm/t-bpabi, arm/t-linux-eabi to tmake_file for
+	arm*-*-linux-*eabi, add arm/t-linux otherwise.
+	(arm*-*-uclinux*): Add arm/t-arm, arm/t-elf to tmake_file.
+	(arm*-*-ecos-elf): Likewise.
+	(arm*-*-eabi*, arm*-*-symbianelf*): Likewise.
+	(arm*-*-rtems*): Likewise.
+	(arm*-*-elf): Likewise.
+	(arm*-wince-pe*): Add arm/t-arm, arm/t-wince-pe to tmake_file.
+	(avr-*-rtems*): Add to tmake_file, add avr/t-avr.
+	(bfin*-elf*): Add bfin/t-bfin to tmake_file.
+	(bfin*-uclinux*): Likewise.
+	(bfin*-linux-uclibc*): Likewise.
+	(bfin*-rtems*): Likewise.
+	(bfin*-*): Likewise.
+	(fido-*-elf): Merge into m68k-*-elf*.
+	(fr30-*-elf)): Add fr30/t-fr30 to tmake_file.
+	(frv-*-*linux*): Add frv/t-frv to tmake_file.
+	(h8300-*-rtems*): Add h8300/t-h8300 to tmake_file.
+	(h8300-*-elf*): Likewise.
+	(hppa*64*-*-linux*): Add pa/t-linux, pa/t-linux64 to tmake_file.
+	(hppa*-*-linux*): Add pa/t-linux to tmake_file.
+	(i[34567]86-*-cygwin*): Add i386/t-chkstk to tmake_file.
+	(i[34567]86-*-mingw*): Likewise.
+	(x86_64-*-mingw*): Likewise.
+	(i[34567]86-*-interix3*): Likewise.
+	(ia64*-*-hpux*): Add ia64/t-ia64, ia64/t-hpux to tmake_file.
+	(ia64-hp-*vms*): Add ia64/t-ia64 to tmake_file.
+	(m68k-*-elf*): Also handle fido-*-elf.
+	Add m68k/t-floatlib to tmake_file.
+	(m68k-*-uclinux*): Add m68k/t-floatlib to tmake_file.
+	(m68k-*-linux*): Likewise.
+	(m68k-*-rtems*): Likewise.
+	(mcore-*-elf): Add mcore/t-mcore to tmake_file.
+	(sh-*-elf*, sh[12346l]*-*-elf*): Add sh/t-sh64 to tmake_file for
+	sh64*-*-*.
+	(sh-*-linux*, sh[2346lbe]*-*-linux*): Add sh/t-sh to tmake_file.
+	Add sh/t-sh64 to tmake_file for sh64*-*-linux*.
+	(sh-*-netbsdelf*, shl*-*-netbsdelf*, sh5-*-netbsd*)
+	(sh5l*-*-netbsd*, sh64-*-netbsd*, sh64l*-*-netbsd*): Add sh/t-sh,
+	sh/t-netbsd to tmake_file.
+	Add sh/t-sh64 to tmake_file for sh5*-*-netbsd*, sh64*-netbsd*.
+	(sh-*-rtems*): Add sh/t-sh to tmake_file.
+	(sh-wrs-vxworks): Likewise.
+	(sparc-*-linux*): Add sparc/t-softmul to tmake_file except for
+	*-leon[3-9]*.
+	(v850*-*-*): Add v850/t-v850 to tmake_file.
+	(vax-*-linux*): Add vax/t-linux to tmake_file.
+	(m32c-*-elf*, m32c-*-rtems*): Add m32c/t-m32c to tmake_file.
+
+2011-11-02  Rainer Orth  <ro@CeBiTec.Uni-Bielefeld.DE>
+
 	* crtstuff.c: New file.
 	* Makefile.in (CRTSTUFF_CFLAGS): Define.
 	(CRTSTUFF_T_CFLAGS): Define.
diff --git a/libgcc/Makefile.in b/libgcc/Makefile.in
index 467901b057a..6bbb369f8e8 100644
--- a/libgcc/Makefile.in
+++ b/libgcc/Makefile.in
@@ -394,25 +394,22 @@ LIB2_DIVMOD_FUNCS := $(filter-out $(LIB2FUNCS_EXCLUDE) $(LIB1ASMFUNCS), \
 ifeq ($(enable_shared),yes)
 
 lib1asmfuncs-o = $(patsubst %,%$(objext),$(LIB1ASMFUNCS))
-$(lib1asmfuncs-o): %$(objext): $(gcc_srcdir)/config/$(LIB1ASMSRC) %.vis
-	$(gcc_compile) -DL$* -xassembler-with-cpp \
-	  -c $(gcc_srcdir)/config/$(LIB1ASMSRC) -include $*.vis
+$(lib1asmfuncs-o): %$(objext): $(srcdir)/config/$(LIB1ASMSRC) %.vis
+	$(gcc_compile) -DL$* -xassembler-with-cpp -c $< -include $*.vis
 $(patsubst %,%.vis,$(LIB1ASMFUNCS)): %.vis: %_s$(objext)
 	$(gen-hide-list)
 libgcc-objects += $(lib1asmfuncs-o)
 
 lib1asmfuncs-s-o = $(patsubst %,%_s$(objext),$(LIB1ASMFUNCS))
-$(lib1asmfuncs-s-o): %_s$(objext): $(gcc_srcdir)/config/$(LIB1ASMSRC)
-	$(gcc_s_compile) -DL$* -xassembler-with-cpp \
-	  -c $(gcc_srcdir)/config/$(LIB1ASMSRC)
+$(lib1asmfuncs-s-o): %_s$(objext): $(srcdir)/config/$(LIB1ASMSRC)
+	$(gcc_s_compile) -DL$* -xassembler-with-cpp -c $<
 libgcc-s-objects += $(lib1asmfuncs-s-o)
 
 else
 
 lib1asmfuncs-o = $(patsubst %,%$(objext),$(LIB1ASMFUNCS))
-$(lib1asmfuncs-o): %$(objext): $(gcc_srcdir)/config/$(LIB1ASMSRC)
-	$(gcc_compile) -DL$* -xassembler-with-cpp \
-	  -c $(gcc_srcdir)/config/$(LIB1ASMSRC)
+$(lib1asmfuncs-o): %$(objext): $(srcdir)/config/$(LIB1ASMSRC)
+	$(gcc_compile) -DL$* -xassembler-with-cpp -c $<
 libgcc-objects += $(lib1asmfuncs-o)
 
 endif
diff --git a/libgcc/config.host b/libgcc/config.host
index 01e2f21a797..0a05ea184b0 100644
--- a/libgcc/config.host
+++ b/libgcc/config.host
@@ -306,22 +306,25 @@ alpha*-dec-*vms*)
 	md_unwind_header=alpha/vms-unwind.h
 	;;
 arm-wrs-vxworks)
-	tmake_file="$tmake_file t-fdpbit"
+	tmake_file="$tmake_file arm/t-arm arm/t-vxworks t-fdpbit"
 	extra_parts="$extra_parts crti.o crtn.o"
 	;;
 arm*-*-freebsd*)
-	tmake_file="$tmake_file t-fdpbit"
+	tmake_file="$tmake_file arm/t-arm arm/t-strongarm-elf t-fdpbit"
 	;;
 arm*-*-netbsdelf*)
-	tmake_file="$tmake_file t-slibgcc-gld-nover"
+	tmake_file="$tmake_file arm/t-arm t-slibgcc-gld-nover"
 	;;
 arm*-*-linux*)			# ARM GNU/Linux with ELF
-	tmake_file="${tmake_file} t-fixedpoint-gnu-prefix"
+	tmake_file="${tmake_file} arm/t-arm t-fixedpoint-gnu-prefix"
 	case ${host} in
 	arm*-*-linux-*eabi)
-	  tmake_file="${tmake_file} arm/t-bpabi t-slibgcc-libgcc"
+	  tmake_file="${tmake_file} arm/t-elf arm/t-bpabi arm/t-linux-eabi t-slibgcc-libgcc"
 	  unwind_header=config/arm/unwind-arm.h
 	  ;;
+	*)
+	  tmake_file="$tmake_file arm/t-linux"
+ 	  ;;
 	esac
 	tmake_file="$tmake_file t-softfp-sfdf t-softfp-excl arm/t-softfp t-softfp"
 	;;
@@ -333,15 +336,15 @@ arm*-*-uclinux*)		# ARM ucLinux
 	  unwind_header=config/arm/unwind-arm.h
 	  ;;
 	esac
-	tmake_file="$tmake_file t-softfp-sfdf t-softfp-excl arm/t-softfp t-softfp"
+	tmake_file="$tmake_file arm/t-arm arm/t-elf t-softfp-sfdf t-softfp-excl arm/t-softfp t-softfp"
 	extra_parts="$extra_parts crti.o crtn.o"
 	;;
 arm*-*-ecos-elf)
-	tmake_file="$tmake_file t-softfp-sfdf t-softfp-excl arm/t-softfp t-softfp"
+	tmake_file="$tmake_file arm/t-arm arm/t-elf t-softfp-sfdf t-softfp-excl arm/t-softfp t-softfp"
 	extra_parts="$extra_parts crti.o crtn.o"
 	;;
 arm*-*-eabi* | arm*-*-symbianelf* )
-	tmake_file="${tmake_file} t-fixedpoint-gnu-prefix"
+	tmake_file="${tmake_file} arm/t-arm arm/t-elf t-fixedpoint-gnu-prefix"
 	case ${host} in
 	arm*-*-eabi*)
 	  tmake_file="${tmake_file} arm/t-bpabi"
@@ -356,17 +359,18 @@ arm*-*-eabi* | arm*-*-symbianelf* )
 	unwind_header=config/arm/unwind-arm.h
 	;;
 arm*-*-rtems*)
-	tmake_file="$tmake_file t-softfp-sfdf t-softfp-excl arm/t-softfp t-softfp"
+	tmake_file="$tmake_file arm/t-arm arm/t-elf t-softfp-sfdf t-softfp-excl arm/t-softfp t-softfp"
 	extra_parts="$extra_parts crti.o crtn.o"
 	;;
 arm*-*-elf)
-	tmake_file="$tmake_file t-softfp-sfdf t-softfp-excl arm/t-softfp t-softfp"
+	tmake_file="$tmake_file arm/t-arm arm/t-elf t-softfp-sfdf t-softfp-excl arm/t-softfp t-softfp"
 	extra_parts="$extra_parts crti.o crtn.o"
 	;;
 arm*-wince-pe*)
+	tmake_file="$tmake_file arm/t-arm arm/t-wince-pe"
 	;;
 avr-*-rtems*)
-	tmake_file=t-fpbit
+	tmake_file="$tmake_file avr/t-avr t-fpbit"
 	# Don't use default.
 	extra_parts=
 	;;
@@ -375,27 +379,27 @@ avr-*-*)
 	tmake_file="${cpu_type}/t-avr t-fpbit"
 	;;
 bfin*-elf*)
-	tmake_file="bfin/t-crtlibid bfin/t-crtstuff t-fdpbit"
+	tmake_file="bfin/t-bfin bfin/t-crtlibid bfin/t-crtstuff t-fdpbit"
 	extra_parts="$extra_parts crtbeginS.o crtendS.o crti.o crtn.o crtlibid.o"
         ;;
 bfin*-uclinux*)
-	tmake_file="bfin/t-crtlibid bfin/t-crtstuff t-fdpbit"
+	tmake_file="bfin/t-bfin bfin/t-crtlibid bfin/t-crtstuff t-fdpbit"
 	extra_parts="$extra_parts crtbeginS.o crtendS.o crtlibid.o"
 	md_unwind_header=bfin/linux-unwind.h
         ;;
 bfin*-linux-uclibc*)
-	tmake_file="$tmake_file bfin/t-crtstuff t-fdpbit bfin/t-linux"
+	tmake_file="$tmake_file bfin/t-bfin bfin/t-crtstuff t-fdpbit bfin/t-linux"
 	# No need to build crtbeginT.o on uClibc systems.  Should probably
 	# be moved to the OS specific section above.
 	extra_parts="crtbegin.o crtbeginS.o crtend.o crtendS.o"
 	md_unwind_header=bfin/linux-unwind.h
 	;;
 bfin*-rtems*)
-	tmake_file="$tmake_file t-fdpbit"
+	tmake_file="$tmake_file bfin/t-bfin t-fdpbit"
 	extra_parts="$extra_parts crti.o crtn.o"
 	;;
 bfin*-*)
-	tmake_file="$tmake_file t-fdpbit"
+	tmake_file="$tmake_file bfin/t-bfin t-fdpbit"
 	extra_parts="crtbegin.o crtend.o crti.o crtn.o"
         ;;
 crisv32-*-elf)
@@ -415,10 +419,8 @@ cris-*-none)
 cris-*-linux* | crisv32-*-linux*)
 	tmake_file="$tmake_file t-fdpbit cris/t-linux"
 	;;
-fido-*-elf)
-	;;
 fr30-*-elf)
-	tmake_file="$tmake_file t-fdpbit"
+	tmake_file="$tmake_file fr30/t-fr30 t-fdpbit"
 	extra_parts="$extra_parts crti.o crtn.o"
 	;;
 frv-*-elf)
@@ -427,20 +429,21 @@ frv-*-elf)
 	extra_parts="frvbegin.o frvend.o"
 	;;
 frv-*-*linux*)
-	tmake_file="$tmake_file t-fdpbit frv/t-linux"
+	tmake_file="$tmake_file frv/t-frv frv/t-linux t-fdpbit"
 	;;
 h8300-*-rtems*)
-	tmake_file="$tmake_file t-fpbit"
+	tmake_file="$tmake_file h8300/t-h8300 t-fpbit"
 	extra_parts="$extra_parts crti.o crtn.o"
 	;;
 h8300-*-elf*)
-	tmake_file="$tmake_file t-fpbit"
+	tmake_file="$tmake_file h8300/t-h8300 t-fpbit"
 	extra_parts="$extra_parts crti.o crtn.o"
 	;;
 hppa*64*-*-linux*)
+	tmake_file="$tmake_file pa/t-linux pa/t-linux64"
 	;;
 hppa*-*-linux*)
-	tmake_file="$tmake_file t-slibgcc-libgcc"
+	tmake_file="$tmake_file pa/t-linux t-slibgcc-libgcc"
 	# Set the libgcc version number
 	if test x$enable_sjlj_exceptions = xyes; then
 	    tmake_file="$tmake_file pa/t-slibgcc-sjlj-ver"
@@ -565,7 +568,7 @@ i[34567]86-*-cygwin*)
 	else
 		tmake_dlldir_file="i386/t-dlldir-x"
 	fi
-	tmake_file="${tmake_file} ${tmake_eh_file} ${tmake_dlldir_file} i386/t-slibgcc-cygming i386/t-cygming i386/t-cygwin i386/t-crtfm t-dfprules"
+	tmake_file="${tmake_file} ${tmake_eh_file} ${tmake_dlldir_file} i386/t-slibgcc-cygming i386/t-cygming i386/t-cygwin i386/t-crtfm i386/t-chkstk t-dfprules"
 	case ${target_thread_file} in
 	  posix)
 	    tmake_file="i386/t-mingw-pthread $tmake_file"
@@ -586,7 +589,7 @@ i[34567]86-*-mingw*)
 	else
 		tmake_dlldir_file="i386/t-dlldir-x"
 	fi
-	tmake_file="${tmake_file} ${tmake_eh_file} ${tmake_dlldir_file} i386/t-slibgcc-cygming i386/t-cygming i386/t-mingw32 i386/t-crtfm t-dfprules"
+	tmake_file="${tmake_file} ${tmake_eh_file} ${tmake_dlldir_file} i386/t-slibgcc-cygming i386/t-cygming i386/t-mingw32 i386/t-crtfm i386/t-chkstk t-dfprules"
 	md_unwind_header=i386/w32-unwind.h
 	;;
 x86_64-*-mingw*)
@@ -602,10 +605,11 @@ x86_64-*-mingw*)
 	else
 		tmake_dlldir_file="i386/t-dlldir-x"
 	fi
-	tmake_file="${tmake_file} ${tmake_eh_file} ${tmake_dlldir_file} i386/t-slibgcc-cygming i386/t-mingw32 t-dfprules i386/t-crtfm"
+	tmake_file="${tmake_file} ${tmake_eh_file} ${tmake_dlldir_file} i386/t-slibgcc-cygming i386/t-mingw32 t-dfprules i386/t-crtfm i386/t-chkstk"
 	extra_parts="$extra_parts crtfastmath.o"
 	;;
 i[34567]86-*-interix3*)
+	tmake_file="$tmake_file i386/t-chkstk"
 	;;
 ia64*-*-elf*)
 	extra_parts="$extra_parts crtbeginS.o crtendS.o crtfastmath.o"
@@ -625,10 +629,10 @@ ia64*-*-linux*)
 	md_unwind_header=ia64/linux-unwind.h
 	;;
 ia64*-*-hpux*)
-	tmake_file="ia64/t-hpux t-slibgcc ia64/t-slibgcc-hpux t-slibgcc-hpux"
+	tmake_file="ia64/t-ia64 ia64/t-hpux t-slibgcc ia64/t-slibgcc-hpux t-slibgcc-hpux"
 	;;
 ia64-hp-*vms*)
-	tmake_file="$tmake_file ia64/t-eh-ia64 ia64/t-vms t-slibgcc-vms"
+	tmake_file="$tmake_file ia64/t-ia64 ia64/t-eh-ia64 ia64/t-vms t-slibgcc-vms"
 	extra_parts="$extra_parts crtinitS.o"
 	md_unwind_header=ia64/vms-unwind.h
 	;;
@@ -660,18 +664,21 @@ m32r-*-linux*)
 m32rle-*-linux*)
 	tmake_file="$tmake_file m32r/t-linux t-fdpbit"
 	;;
-m68k-*-elf*)
+m68k-*-elf* | fido-*-elf)
+	tmake_file="$tmake_file m68k/t-floatlib"
 	;;
 m68k*-*-netbsdelf*)
 	;;
 m68k*-*-openbsd*)
 	;;
 m68k-*-uclinux*)		# Motorola m68k/ColdFire running uClinux with uClibc
+	tmake_file="$tmake_file m68k/t-floatlib"
 	md_unwind_header=m68k/linux-unwind.h
 	;;
 m68k-*-linux*)			# Motorola m68k's running GNU/Linux
 				# with ELF format using glibc 2
 				# aka the GNU/Linux C library 6.
+	tmake_file="$tmake_file m68k/t-floatlib"
 	# If not configured with --enable-sjlj-exceptions, bump the
 	# libgcc version number.
 	if test x$enable_sjlj_exceptions != xyes; then
@@ -680,10 +687,11 @@ m68k-*-linux*)			# Motorola m68k's running GNU/Linux
 	md_unwind_header=m68k/linux-unwind.h
 	;;
 m68k-*-rtems*)
+	tmake_file="$tmake_file m68k/t-floatlib"
 	extra_parts="$extra_parts crti.o crtn.o"
 	;;
 mcore-*-elf)
-	tmake_file=t-fdpbit
+	tmake_file="mcore/t-mcore t-fdpbit"
 	extra_parts="$extra_parts crti.o crtn.o"
 	;;
 microblaze*-linux*)
@@ -905,6 +913,10 @@ sh-*-elf* | sh[12346l]*-*-elf*)
 		libic_invalidate_array_4-200.a \
 		libic_invalidate_array_4a.a \
 		libgcc-Os-4-200.a libgcc-4-300.a"
+	case ${host} in sh64*-*-*)
+		tmake_file="$tmake_file sh/t-sh64"
+		;;
+	esac
 	case ${host} in
 	sh*-superh-elf)
 		tmake_file="$tmake_file sh/t-superh"
@@ -913,23 +925,33 @@ sh-*-elf* | sh[12346l]*-*-elf*)
  	esac
 	;;
 sh-*-linux* | sh[2346lbe]*-*-linux*)
-	tmake_file="${tmake_file} t-slibgcc-libgcc sh/t-linux t-fdpbit"
+	tmake_file="${tmake_file} sh/t-sh t-slibgcc-libgcc sh/t-linux t-fdpbit"
+	case ${host} in sh64*-*-linux*)
+		tmake_file="$tmake_file sh/t-sh64"
+		;;
+	esac
 	md_unwind_header=sh/linux-unwind.h
 	;;
 sh-*-netbsdelf* | shl*-*-netbsdelf* | sh5-*-netbsd* | sh5l*-*-netbsd* | \
   sh64-*-netbsd* | sh64l*-*-netbsd*)
+	tmake_file="$tmake_file sh/t-sh sh/t-netbsd"
+	case ${host} in
+	sh5*-*-netbsd* | sh64*-netbsd*)
+		tmake_file="$tmake_file sh/t-sh64"
+		;;
+	esac
 	# NetBSD's C library includes a fast software FP library that
 	# has support for setting/setting the rounding mode, exception
 	# mask, etc.  Therefore, we don't want to include software FP
 	# in libgcc.
 	;;
 sh-*-rtems*)
-	tmake_file="$tmake_file t-crtstuff-pic t-fdpbit"
+	tmake_file="$tmake_file sh/t-sh t-crtstuff-pic t-fdpbit"
 	extra_parts="$extra_parts crt1.o crti.o crtn.o crtbeginS.o crtendS.o \
 		$sh_ic_extra_parts $sh_opt_extra_parts"
 	;;
 sh-wrs-vxworks)
-	tmake_file="$tmake_file t-crtstuff-pic t-fdpbit"
+	tmake_file="$tmake_file sh/t-sh t-crtstuff-pic t-fdpbit"
 	;;
 sparc-*-netbsdelf*)
 	;;
@@ -956,6 +978,13 @@ sparc-*-linux*)		# SPARC's running GNU/Linux, libc6
 		tmake_file="${tmake_file} sparc/t-linux"
 		;;
 	esac
+	case ${host} in
+	*-leon[3-9]*)
+		;;
+	*)
+	  	tmake_file="$tmake_file sparc/t-softmul"
+	  	;;
+	esac
 	extra_parts="$extra_parts crtfastmath.o"
 	md_unwind_header=sparc/linux-unwind.h
 	;;
@@ -1007,9 +1036,10 @@ tic6x-*-elf)
 	unwind_header=config/c6x/unwind-c6x.h
 	;;
 v850*-*-*)
-	tmake_file=t-fdpbit
+	tmake_file="v850/t-v850 t-fdpbit"
 	;;
 vax-*-linux*)
+	tmake_file="$tmake_file vax/t-linux"
 	;;
 vax-*-netbsdelf*)
 	;;
@@ -1032,6 +1062,7 @@ am33_2.0-*-linux*)
 	tmake_file="$tmake_file t-fdpbit"
 	;;
 m32c-*-elf*|m32c-*-rtems*)
+	tmake_file="$tmake_file m32c/t-m32c"
  	;;
 mep*-*-*)
 	tmake_file="mep/t-mep t-fdpbit"
diff --git a/libgcc/config/arm/bpabi-v6m.S b/libgcc/config/arm/bpabi-v6m.S
new file mode 100644
index 00000000000..4ecea6da5a6
--- /dev/null
+++ b/libgcc/config/arm/bpabi-v6m.S
@@ -0,0 +1,318 @@
+/* Miscellaneous BPABI functions.  ARMv6M implementation
+
+   Copyright (C) 2006, 2008, 2009, 2010  Free Software Foundation, Inc.
+   Contributed by CodeSourcery.
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef __ARM_EABI__
+/* Some attributes that are common to all routines in this file.  */
+	/* Tag_ABI_align_needed: This code does not require 8-byte
+	   alignment from the caller.  */
+	/* .eabi_attribute 24, 0  -- default setting.  */
+	/* Tag_ABI_align_preserved: This code preserves 8-byte
+	   alignment in any callee.  */
+	.eabi_attribute 25, 1
+#endif /* __ARM_EABI__ */
+
+#ifdef L_aeabi_lcmp
+
+FUNC_START aeabi_lcmp
+	cmp	xxh, yyh
+	beq	1f
+	bgt	2f
+	mov	r0, #1
+	neg	r0, r0
+	RET
+2:
+	mov	r0, #1
+	RET
+1:
+	sub	r0, xxl, yyl
+	beq	1f
+	bhi	2f
+	mov	r0, #1
+	neg	r0, r0
+	RET
+2:
+	mov	r0, #1
+1:
+	RET
+	FUNC_END aeabi_lcmp
+
+#endif /* L_aeabi_lcmp */
+	
+#ifdef L_aeabi_ulcmp
+
+FUNC_START aeabi_ulcmp
+	cmp	xxh, yyh
+	bne	1f
+	sub	r0, xxl, yyl
+	beq	2f
+1:
+	bcs	1f
+	mov	r0, #1
+	neg	r0, r0
+	RET
+1:
+	mov	r0, #1
+2:
+	RET
+	FUNC_END aeabi_ulcmp
+
+#endif /* L_aeabi_ulcmp */
+
+.macro test_div_by_zero signed
+	cmp	yyh, #0
+	bne	7f
+	cmp	yyl, #0
+	bne	7f
+	cmp	xxh, #0
+	bne	2f
+	cmp	xxl, #0
+2:
+	.ifc	\signed, unsigned
+	beq	3f
+	mov	xxh, #0
+	mvn	xxh, xxh		@ 0xffffffff
+	mov	xxl, xxh
+3:
+	.else
+	beq	5f
+	blt	6f
+	mov	xxl, #0
+	mvn	xxl, xxl		@ 0xffffffff
+	lsr	xxh, xxl, #1		@ 0x7fffffff
+	b	5f
+6:	mov	xxh, #0x80
+	lsl	xxh, xxh, #24		@ 0x80000000
+	mov	xxl, #0
+5:
+	.endif
+	@ tailcalls are tricky on v6-m.
+	push	{r0, r1, r2}
+	ldr	r0, 1f
+	adr	r1, 1f
+	add	r0, r1
+	str	r0, [sp, #8]
+	@ We know we are not on armv4t, so pop pc is safe.
+	pop	{r0, r1, pc}
+	.align	2
+1:
+	.word	__aeabi_ldiv0 - 1b
+7:
+.endm
+
+#ifdef L_aeabi_ldivmod
+
+FUNC_START aeabi_ldivmod
+	test_div_by_zero signed
+
+	push {r0, r1}
+	mov r0, sp
+	push {r0, lr}
+	ldr r0, [sp, #8]
+	bl SYM(__gnu_ldivmod_helper)
+	ldr r3, [sp, #4]
+	mov lr, r3
+	add sp, sp, #8
+	pop {r2, r3}
+	RET
+	FUNC_END aeabi_ldivmod
+
+#endif /* L_aeabi_ldivmod */
+
+#ifdef L_aeabi_uldivmod
+
+FUNC_START aeabi_uldivmod
+	test_div_by_zero unsigned
+
+	push {r0, r1}
+	mov r0, sp
+	push {r0, lr}
+	ldr r0, [sp, #8]
+	bl SYM(__gnu_uldivmod_helper)
+	ldr r3, [sp, #4]
+	mov lr, r3
+	add sp, sp, #8
+	pop {r2, r3}
+	RET
+	FUNC_END aeabi_uldivmod
+	
+#endif /* L_aeabi_uldivmod */
+
+#ifdef L_arm_addsubsf3
+
+FUNC_START aeabi_frsub
+
+      push	{r4, lr}
+      mov	r4, #1
+      lsl	r4, #31
+      eor	r0, r0, r4
+      bl	__aeabi_fadd
+      pop	{r4, pc}
+
+      FUNC_END aeabi_frsub
+
+#endif /* L_arm_addsubsf3 */
+
+#ifdef L_arm_cmpsf2
+
+FUNC_START aeabi_cfrcmple
+
+	mov	ip, r0
+	mov	r0, r1
+	mov	r1, ip
+	b	6f
+
+FUNC_START aeabi_cfcmpeq
+FUNC_ALIAS aeabi_cfcmple aeabi_cfcmpeq
+
+	@ The status-returning routines are required to preserve all
+	@ registers except ip, lr, and cpsr.
+6:	push	{r0, r1, r2, r3, r4, lr}
+	bl	__lesf2
+	@ Set the Z flag correctly, and the C flag unconditionally.
+	cmp	r0, #0
+	@ Clear the C flag if the return value was -1, indicating
+	@ that the first operand was smaller than the second.
+	bmi 1f
+	mov	r1, #0
+	cmn	r0, r1
+1:
+	pop	{r0, r1, r2, r3, r4, pc}
+
+	FUNC_END aeabi_cfcmple
+	FUNC_END aeabi_cfcmpeq
+	FUNC_END aeabi_cfrcmple
+
+FUNC_START	aeabi_fcmpeq
+
+	push	{r4, lr}
+	bl	__eqsf2
+	neg	r0, r0
+	add	r0, r0, #1
+	pop	{r4, pc}
+
+	FUNC_END aeabi_fcmpeq
+
+.macro COMPARISON cond, helper, mode=sf2
+FUNC_START	aeabi_fcmp\cond
+
+	push	{r4, lr}
+	bl	__\helper\mode
+	cmp	r0, #0
+	b\cond	1f
+	mov	r0, #0
+	pop	{r4, pc}
+1:
+	mov	r0, #1
+	pop	{r4, pc}
+
+	FUNC_END aeabi_fcmp\cond
+.endm
+
+COMPARISON lt, le
+COMPARISON le, le
+COMPARISON gt, ge
+COMPARISON ge, ge
+
+#endif /* L_arm_cmpsf2 */
+
+#ifdef L_arm_addsubdf3
+
+FUNC_START aeabi_drsub
+
+      push	{r4, lr}
+      mov	r4, #1
+      lsl	r4, #31
+      eor	xxh, xxh, r4
+      bl	__aeabi_dadd
+      pop	{r4, pc}
+
+      FUNC_END aeabi_drsub
+
+#endif /* L_arm_addsubdf3 */
+
+#ifdef L_arm_cmpdf2
+
+FUNC_START aeabi_cdrcmple
+
+	mov	ip, r0
+	mov	r0, r2
+	mov	r2, ip
+	mov	ip, r1
+	mov	r1, r3
+	mov	r3, ip
+	b	6f
+
+FUNC_START aeabi_cdcmpeq
+FUNC_ALIAS aeabi_cdcmple aeabi_cdcmpeq
+
+	@ The status-returning routines are required to preserve all
+	@ registers except ip, lr, and cpsr.
+6:	push	{r0, r1, r2, r3, r4, lr}
+	bl	__ledf2
+	@ Set the Z flag correctly, and the C flag unconditionally.
+	cmp	r0, #0
+	@ Clear the C flag if the return value was -1, indicating
+	@ that the first operand was smaller than the second.
+	bmi 1f
+	mov	r1, #0
+	cmn	r0, r1
+1:
+	pop	{r0, r1, r2, r3, r4, pc}
+
+	FUNC_END aeabi_cdcmple
+	FUNC_END aeabi_cdcmpeq
+	FUNC_END aeabi_cdrcmple
+
+FUNC_START	aeabi_dcmpeq
+
+	push	{r4, lr}
+	bl	__eqdf2
+	neg	r0, r0
+	add	r0, r0, #1
+	pop	{r4, pc}
+
+	FUNC_END aeabi_dcmpeq
+
+.macro COMPARISON cond, helper, mode=df2
+FUNC_START	aeabi_dcmp\cond
+
+	push	{r4, lr}
+	bl	__\helper\mode
+	cmp	r0, #0
+	b\cond	1f
+	mov	r0, #0
+	pop	{r4, pc}
+1:
+	mov	r0, #1
+	pop	{r4, pc}
+
+	FUNC_END aeabi_dcmp\cond
+.endm
+
+COMPARISON lt, le
+COMPARISON le, le
+COMPARISON gt, ge
+COMPARISON ge, ge
+
+#endif /* L_arm_cmpdf2 */
diff --git a/libgcc/config/arm/bpabi.S b/libgcc/config/arm/bpabi.S
new file mode 100644
index 00000000000..2ff338927fa
--- /dev/null
+++ b/libgcc/config/arm/bpabi.S
@@ -0,0 +1,163 @@
+/* Miscellaneous BPABI functions.
+
+   Copyright (C) 2003, 2004, 2007, 2008, 2009, 2010
+   Free Software Foundation, Inc.
+   Contributed by CodeSourcery, LLC.
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef __ARM_EABI__
+/* Some attributes that are common to all routines in this file.  */
+	/* Tag_ABI_align_needed: This code does not require 8-byte
+	   alignment from the caller.  */
+	/* .eabi_attribute 24, 0  -- default setting.  */
+	/* Tag_ABI_align_preserved: This code preserves 8-byte
+	   alignment in any callee.  */
+	.eabi_attribute 25, 1
+#endif /* __ARM_EABI__ */
+
+#ifdef L_aeabi_lcmp
+
+ARM_FUNC_START aeabi_lcmp
+	cmp	xxh, yyh
+	do_it	lt
+	movlt	r0, #-1
+	do_it	gt
+	movgt	r0, #1
+	do_it	ne
+	RETc(ne)
+	subs	r0, xxl, yyl
+	do_it	lo
+	movlo	r0, #-1
+	do_it	hi
+	movhi	r0, #1
+	RET
+	FUNC_END aeabi_lcmp
+
+#endif /* L_aeabi_lcmp */
+	
+#ifdef L_aeabi_ulcmp
+
+ARM_FUNC_START aeabi_ulcmp
+	cmp	xxh, yyh
+	do_it	lo
+	movlo	r0, #-1
+	do_it	hi
+	movhi	r0, #1
+	do_it	ne
+	RETc(ne)
+	cmp	xxl, yyl
+	do_it	lo
+	movlo	r0, #-1
+	do_it	hi
+	movhi	r0, #1
+	do_it	eq
+	moveq	r0, #0
+	RET
+	FUNC_END aeabi_ulcmp
+
+#endif /* L_aeabi_ulcmp */
+
+.macro test_div_by_zero signed
+/* Tail-call to divide-by-zero handlers which may be overridden by the user,
+   so unwinding works properly.  */
+#if defined(__thumb2__)
+	cbnz	yyh, 1f
+	cbnz	yyl, 1f
+	cmp	xxh, #0
+	do_it	eq
+	cmpeq	xxl, #0
+	.ifc \signed, unsigned
+	beq	2f
+	mov	xxh, #0xffffffff
+	mov	xxl, xxh
+2:
+	.else
+	do_it	lt, t
+	movlt	xxl, #0
+	movlt	xxh, #0x80000000
+	do_it	gt, t
+	movgt	xxh, #0x7fffffff
+	movgt	xxl, #0xffffffff
+	.endif
+	b	SYM (__aeabi_ldiv0) __PLT__
+1:
+#else
+	/* Note: Thumb-1 code calls via an ARM shim on processors which
+	   support ARM mode.  */
+	cmp	yyh, #0
+	cmpeq	yyl, #0
+	bne	2f
+	cmp	xxh, #0
+	cmpeq	xxl, #0
+	.ifc \signed, unsigned
+	movne	xxh, #0xffffffff
+	movne	xxl, #0xffffffff
+	.else
+	movlt	xxh, #0x80000000
+	movlt	xxl, #0
+	movgt	xxh, #0x7fffffff
+	movgt	xxl, #0xffffffff
+	.endif
+	b	SYM (__aeabi_ldiv0) __PLT__
+2:
+#endif
+.endm
+
+#ifdef L_aeabi_ldivmod
+
+ARM_FUNC_START aeabi_ldivmod
+	test_div_by_zero signed
+
+	sub sp, sp, #8
+#if defined(__thumb2__)
+	mov ip, sp
+	push {ip, lr}
+#else
+	do_push {sp, lr}
+#endif
+	bl SYM(__gnu_ldivmod_helper) __PLT__
+	ldr lr, [sp, #4]
+	add sp, sp, #8
+	do_pop {r2, r3}
+	RET
+	
+#endif /* L_aeabi_ldivmod */
+
+#ifdef L_aeabi_uldivmod
+
+ARM_FUNC_START aeabi_uldivmod
+	test_div_by_zero unsigned
+
+	sub sp, sp, #8
+#if defined(__thumb2__)
+	mov ip, sp
+	push {ip, lr}
+#else
+	do_push {sp, lr}
+#endif
+	bl SYM(__gnu_uldivmod_helper) __PLT__
+	ldr lr, [sp, #4]
+	add sp, sp, #8
+	do_pop {r2, r3}
+	RET
+	
+#endif /* L_aeabi_divmod */
+	
diff --git a/libgcc/config/arm/ieee754-df.S b/libgcc/config/arm/ieee754-df.S
new file mode 100644
index 00000000000..eb0c38632d0
--- /dev/null
+++ b/libgcc/config/arm/ieee754-df.S
@@ -0,0 +1,1447 @@
+/* ieee754-df.S double-precision floating point support for ARM
+
+   Copyright (C) 2003, 2004, 2005, 2007, 2008, 2009  Free Software Foundation, Inc.
+   Contributed by Nicolas Pitre (nico@cam.org)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/*
+ * Notes: 
+ * 
+ * The goal of this code is to be as fast as possible.  This is
+ * not meant to be easy to understand for the casual reader.
+ * For slightly simpler code please see the single precision version
+ * of this file.
+ * 
+ * Only the default rounding mode is intended for best performances.
+ * Exceptions aren't supported yet, but that can be added quite easily
+ * if necessary without impacting performances.
+ */
+
+
+@ For FPA, float words are always big-endian.
+@ For VFP, floats words follow the memory system mode.
+#if defined(__VFP_FP__) && !defined(__ARMEB__)
+#define xl r0
+#define xh r1
+#define yl r2
+#define yh r3
+#else
+#define xh r0
+#define xl r1
+#define yh r2
+#define yl r3
+#endif
+
+
+#ifdef L_arm_negdf2
+
+ARM_FUNC_START negdf2
+ARM_FUNC_ALIAS aeabi_dneg negdf2
+
+	@ flip sign bit
+	eor	xh, xh, #0x80000000
+	RET
+
+	FUNC_END aeabi_dneg
+	FUNC_END negdf2
+
+#endif
+
+#ifdef L_arm_addsubdf3
+
+ARM_FUNC_START aeabi_drsub
+
+	eor	xh, xh, #0x80000000	@ flip sign bit of first arg
+	b	1f	
+
+ARM_FUNC_START subdf3
+ARM_FUNC_ALIAS aeabi_dsub subdf3
+
+	eor	yh, yh, #0x80000000	@ flip sign bit of second arg
+#if defined(__INTERWORKING_STUBS__)
+	b	1f			@ Skip Thumb-code prologue
+#endif
+
+ARM_FUNC_START adddf3
+ARM_FUNC_ALIAS aeabi_dadd adddf3
+
+1:	do_push	{r4, r5, lr}
+
+	@ Look for zeroes, equal values, INF, or NAN.
+	shift1	lsl, r4, xh, #1
+	shift1	lsl, r5, yh, #1
+	teq	r4, r5
+	do_it	eq
+	teqeq	xl, yl
+	do_it	ne, ttt
+	COND(orr,s,ne)	ip, r4, xl
+	COND(orr,s,ne)	ip, r5, yl
+	COND(mvn,s,ne)	ip, r4, asr #21
+	COND(mvn,s,ne)	ip, r5, asr #21
+	beq	LSYM(Lad_s)
+
+	@ Compute exponent difference.  Make largest exponent in r4,
+	@ corresponding arg in xh-xl, and positive exponent difference in r5.
+	shift1	lsr, r4, r4, #21
+	rsbs	r5, r4, r5, lsr #21
+	do_it	lt
+	rsblt	r5, r5, #0
+	ble	1f
+	add	r4, r4, r5
+	eor	yl, xl, yl
+	eor	yh, xh, yh
+	eor	xl, yl, xl
+	eor	xh, yh, xh
+	eor	yl, xl, yl
+	eor	yh, xh, yh
+1:
+	@ If exponent difference is too large, return largest argument
+	@ already in xh-xl.  We need up to 54 bit to handle proper rounding
+	@ of 0x1p54 - 1.1.
+	cmp	r5, #54
+	do_it	hi
+	RETLDM	"r4, r5" hi
+
+	@ Convert mantissa to signed integer.
+	tst	xh, #0x80000000
+	mov	xh, xh, lsl #12
+	mov	ip, #0x00100000
+	orr	xh, ip, xh, lsr #12
+	beq	1f
+#if defined(__thumb2__)
+	negs	xl, xl
+	sbc	xh, xh, xh, lsl #1
+#else
+	rsbs	xl, xl, #0
+	rsc	xh, xh, #0
+#endif
+1:
+	tst	yh, #0x80000000
+	mov	yh, yh, lsl #12
+	orr	yh, ip, yh, lsr #12
+	beq	1f
+#if defined(__thumb2__)
+	negs	yl, yl
+	sbc	yh, yh, yh, lsl #1
+#else
+	rsbs	yl, yl, #0
+	rsc	yh, yh, #0
+#endif
+1:
+	@ If exponent == difference, one or both args were denormalized.
+	@ Since this is not common case, rescale them off line.
+	teq	r4, r5
+	beq	LSYM(Lad_d)
+LSYM(Lad_x):
+
+	@ Compensate for the exponent overlapping the mantissa MSB added later
+	sub	r4, r4, #1
+
+	@ Shift yh-yl right per r5, add to xh-xl, keep leftover bits into ip.
+	rsbs	lr, r5, #32
+	blt	1f
+	shift1	lsl, ip, yl, lr
+	shiftop adds xl xl yl lsr r5 yl
+	adc	xh, xh, #0
+	shiftop adds xl xl yh lsl lr yl
+	shiftop adcs xh xh yh asr r5 yh
+	b	2f
+1:	sub	r5, r5, #32
+	add	lr, lr, #32
+	cmp	yl, #1
+	shift1	lsl,ip, yh, lr
+	do_it	cs
+	orrcs	ip, ip, #2		@ 2 not 1, to allow lsr #1 later
+	shiftop adds xl xl yh asr r5 yh
+	adcs	xh, xh, yh, asr #31
+2:
+	@ We now have a result in xh-xl-ip.
+	@ Keep absolute value in xh-xl-ip, sign in r5 (the n bit was set above)
+	and	r5, xh, #0x80000000
+	bpl	LSYM(Lad_p)
+#if defined(__thumb2__)
+	mov	lr, #0
+	negs	ip, ip
+	sbcs	xl, lr, xl
+	sbc	xh, lr, xh
+#else
+	rsbs	ip, ip, #0
+	rscs	xl, xl, #0
+	rsc	xh, xh, #0
+#endif
+
+	@ Determine how to normalize the result.
+LSYM(Lad_p):
+	cmp	xh, #0x00100000
+	bcc	LSYM(Lad_a)
+	cmp	xh, #0x00200000
+	bcc	LSYM(Lad_e)
+
+	@ Result needs to be shifted right.
+	movs	xh, xh, lsr #1
+	movs	xl, xl, rrx
+	mov	ip, ip, rrx
+	add	r4, r4, #1
+
+	@ Make sure we did not bust our exponent.
+	mov	r2, r4, lsl #21
+	cmn	r2, #(2 << 21)
+	bcs	LSYM(Lad_o)
+
+	@ Our result is now properly aligned into xh-xl, remaining bits in ip.
+	@ Round with MSB of ip. If halfway between two numbers, round towards
+	@ LSB of xl = 0.
+	@ Pack final result together.
+LSYM(Lad_e):
+	cmp	ip, #0x80000000
+	do_it	eq
+	COND(mov,s,eq)	ip, xl, lsr #1
+	adcs	xl, xl, #0
+	adc	xh, xh, r4, lsl #20
+	orr	xh, xh, r5
+	RETLDM	"r4, r5"
+
+	@ Result must be shifted left and exponent adjusted.
+LSYM(Lad_a):
+	movs	ip, ip, lsl #1
+	adcs	xl, xl, xl
+	adc	xh, xh, xh
+	tst	xh, #0x00100000
+	sub	r4, r4, #1
+	bne	LSYM(Lad_e)
+
+	@ No rounding necessary since ip will always be 0 at this point.
+LSYM(Lad_l):
+
+#if __ARM_ARCH__ < 5
+
+	teq	xh, #0
+	movne	r3, #20
+	moveq	r3, #52
+	moveq	xh, xl
+	moveq	xl, #0
+	mov	r2, xh
+	cmp	r2, #(1 << 16)
+	movhs	r2, r2, lsr #16
+	subhs	r3, r3, #16
+	cmp	r2, #(1 << 8)
+	movhs	r2, r2, lsr #8
+	subhs	r3, r3, #8
+	cmp	r2, #(1 << 4)
+	movhs	r2, r2, lsr #4
+	subhs	r3, r3, #4
+	cmp	r2, #(1 << 2)
+	subhs	r3, r3, #2
+	sublo	r3, r3, r2, lsr #1
+	sub	r3, r3, r2, lsr #3
+
+#else
+
+	teq	xh, #0
+	do_it	eq, t
+	moveq	xh, xl
+	moveq	xl, #0
+	clz	r3, xh
+	do_it	eq
+	addeq	r3, r3, #32
+	sub	r3, r3, #11
+
+#endif
+
+	@ determine how to shift the value.
+	subs	r2, r3, #32
+	bge	2f
+	adds	r2, r2, #12
+	ble	1f
+
+	@ shift value left 21 to 31 bits, or actually right 11 to 1 bits
+	@ since a register switch happened above.
+	add	ip, r2, #20
+	rsb	r2, r2, #12
+	shift1	lsl, xl, xh, ip
+	shift1	lsr, xh, xh, r2
+	b	3f
+
+	@ actually shift value left 1 to 20 bits, which might also represent
+	@ 32 to 52 bits if counting the register switch that happened earlier.
+1:	add	r2, r2, #20
+2:	do_it	le
+	rsble	ip, r2, #32
+	shift1	lsl, xh, xh, r2
+#if defined(__thumb2__)
+	lsr	ip, xl, ip
+	itt	le
+	orrle	xh, xh, ip
+	lslle	xl, xl, r2
+#else
+	orrle	xh, xh, xl, lsr ip
+	movle	xl, xl, lsl r2
+#endif
+
+	@ adjust exponent accordingly.
+3:	subs	r4, r4, r3
+	do_it	ge, tt
+	addge	xh, xh, r4, lsl #20
+	orrge	xh, xh, r5
+	RETLDM	"r4, r5" ge
+
+	@ Exponent too small, denormalize result.
+	@ Find out proper shift value.
+	mvn	r4, r4
+	subs	r4, r4, #31
+	bge	2f
+	adds	r4, r4, #12
+	bgt	1f
+
+	@ shift result right of 1 to 20 bits, sign is in r5.
+	add	r4, r4, #20
+	rsb	r2, r4, #32
+	shift1	lsr, xl, xl, r4
+	shiftop orr xl xl xh lsl r2 yh
+	shiftop orr xh r5 xh lsr r4 yh
+	RETLDM	"r4, r5"
+
+	@ shift result right of 21 to 31 bits, or left 11 to 1 bits after
+	@ a register switch from xh to xl.
+1:	rsb	r4, r4, #12
+	rsb	r2, r4, #32
+	shift1	lsr, xl, xl, r2
+	shiftop orr xl xl xh lsl r4 yh
+	mov	xh, r5
+	RETLDM	"r4, r5"
+
+	@ Shift value right of 32 to 64 bits, or 0 to 32 bits after a switch
+	@ from xh to xl.
+2:	shift1	lsr, xl, xh, r4
+	mov	xh, r5
+	RETLDM	"r4, r5"
+
+	@ Adjust exponents for denormalized arguments.
+	@ Note that r4 must not remain equal to 0.
+LSYM(Lad_d):
+	teq	r4, #0
+	eor	yh, yh, #0x00100000
+	do_it	eq, te
+	eoreq	xh, xh, #0x00100000
+	addeq	r4, r4, #1
+	subne	r5, r5, #1
+	b	LSYM(Lad_x)
+
+
+LSYM(Lad_s):
+	mvns	ip, r4, asr #21
+	do_it	ne
+	COND(mvn,s,ne)	ip, r5, asr #21
+	beq	LSYM(Lad_i)
+
+	teq	r4, r5
+	do_it	eq
+	teqeq	xl, yl
+	beq	1f
+
+	@ Result is x + 0.0 = x or 0.0 + y = y.
+	orrs	ip, r4, xl
+	do_it	eq, t
+	moveq	xh, yh
+	moveq	xl, yl
+	RETLDM	"r4, r5"
+
+1:	teq	xh, yh
+
+	@ Result is x - x = 0.
+	do_it	ne, tt
+	movne	xh, #0
+	movne	xl, #0
+	RETLDM	"r4, r5" ne
+
+	@ Result is x + x = 2x.
+	movs	ip, r4, lsr #21
+	bne	2f
+	movs	xl, xl, lsl #1
+	adcs	xh, xh, xh
+	do_it	cs
+	orrcs	xh, xh, #0x80000000
+	RETLDM	"r4, r5"
+2:	adds	r4, r4, #(2 << 21)
+	do_it	cc, t
+	addcc	xh, xh, #(1 << 20)
+	RETLDM	"r4, r5" cc
+	and	r5, xh, #0x80000000
+
+	@ Overflow: return INF.
+LSYM(Lad_o):
+	orr	xh, r5, #0x7f000000
+	orr	xh, xh, #0x00f00000
+	mov	xl, #0
+	RETLDM	"r4, r5"
+
+	@ At least one of x or y is INF/NAN.
+	@   if xh-xl != INF/NAN: return yh-yl (which is INF/NAN)
+	@   if yh-yl != INF/NAN: return xh-xl (which is INF/NAN)
+	@   if either is NAN: return NAN
+	@   if opposite sign: return NAN
+	@   otherwise return xh-xl (which is INF or -INF)
+LSYM(Lad_i):
+	mvns	ip, r4, asr #21
+	do_it	ne, te
+	movne	xh, yh
+	movne	xl, yl
+	COND(mvn,s,eq)	ip, r5, asr #21
+	do_it	ne, t
+	movne	yh, xh
+	movne	yl, xl
+	orrs	r4, xl, xh, lsl #12
+	do_it	eq, te
+	COND(orr,s,eq)	r5, yl, yh, lsl #12
+	teqeq	xh, yh
+	orrne	xh, xh, #0x00080000	@ quiet NAN
+	RETLDM	"r4, r5"
+
+	FUNC_END aeabi_dsub
+	FUNC_END subdf3
+	FUNC_END aeabi_dadd
+	FUNC_END adddf3
+
+ARM_FUNC_START floatunsidf
+ARM_FUNC_ALIAS aeabi_ui2d floatunsidf
+
+	teq	r0, #0
+	do_it	eq, t
+	moveq	r1, #0
+	RETc(eq)
+	do_push	{r4, r5, lr}
+	mov	r4, #0x400		@ initial exponent
+	add	r4, r4, #(52-1 - 1)
+	mov	r5, #0			@ sign bit is 0
+	.ifnc	xl, r0
+	mov	xl, r0
+	.endif
+	mov	xh, #0
+	b	LSYM(Lad_l)
+
+	FUNC_END aeabi_ui2d
+	FUNC_END floatunsidf
+
+ARM_FUNC_START floatsidf
+ARM_FUNC_ALIAS aeabi_i2d floatsidf
+
+	teq	r0, #0
+	do_it	eq, t
+	moveq	r1, #0
+	RETc(eq)
+	do_push	{r4, r5, lr}
+	mov	r4, #0x400		@ initial exponent
+	add	r4, r4, #(52-1 - 1)
+	ands	r5, r0, #0x80000000	@ sign bit in r5
+	do_it	mi
+	rsbmi	r0, r0, #0		@ absolute value
+	.ifnc	xl, r0
+	mov	xl, r0
+	.endif
+	mov	xh, #0
+	b	LSYM(Lad_l)
+
+	FUNC_END aeabi_i2d
+	FUNC_END floatsidf
+
+ARM_FUNC_START extendsfdf2
+ARM_FUNC_ALIAS aeabi_f2d extendsfdf2
+
+	movs	r2, r0, lsl #1		@ toss sign bit
+	mov	xh, r2, asr #3		@ stretch exponent
+	mov	xh, xh, rrx		@ retrieve sign bit
+	mov	xl, r2, lsl #28		@ retrieve remaining bits
+	do_it	ne, ttt
+	COND(and,s,ne)	r3, r2, #0xff000000	@ isolate exponent
+	teqne	r3, #0xff000000		@ if not 0, check if INF or NAN
+	eorne	xh, xh, #0x38000000	@ fixup exponent otherwise.
+	RETc(ne)			@ and return it.
+
+	teq	r2, #0			@ if actually 0
+	do_it	ne, e
+	teqne	r3, #0xff000000		@ or INF or NAN
+	RETc(eq)			@ we are done already.
+
+	@ value was denormalized.  We can normalize it now.
+	do_push	{r4, r5, lr}
+	mov	r4, #0x380		@ setup corresponding exponent
+	and	r5, xh, #0x80000000	@ move sign bit in r5
+	bic	xh, xh, #0x80000000
+	b	LSYM(Lad_l)
+
+	FUNC_END aeabi_f2d
+	FUNC_END extendsfdf2
+
+ARM_FUNC_START floatundidf
+ARM_FUNC_ALIAS aeabi_ul2d floatundidf
+
+	orrs	r2, r0, r1
+#if !defined (__VFP_FP__) && !defined(__SOFTFP__)
+	do_it	eq, t
+	mvfeqd	f0, #0.0
+#else
+	do_it	eq
+#endif
+	RETc(eq)
+
+#if !defined (__VFP_FP__) && !defined(__SOFTFP__)
+	@ For hard FPA code we want to return via the tail below so that
+	@ we can return the result in f0 as well as in r0/r1 for backwards
+	@ compatibility.
+	adr	ip, LSYM(f0_ret)
+	@ Push pc as well so that RETLDM works correctly.
+	do_push	{r4, r5, ip, lr, pc}
+#else
+	do_push	{r4, r5, lr}
+#endif
+
+	mov	r5, #0
+	b	2f
+
+ARM_FUNC_START floatdidf
+ARM_FUNC_ALIAS aeabi_l2d floatdidf
+
+	orrs	r2, r0, r1
+#if !defined (__VFP_FP__) && !defined(__SOFTFP__)
+	do_it	eq, t
+	mvfeqd	f0, #0.0
+#else
+	do_it	eq
+#endif
+	RETc(eq)
+
+#if !defined (__VFP_FP__) && !defined(__SOFTFP__)
+	@ For hard FPA code we want to return via the tail below so that
+	@ we can return the result in f0 as well as in r0/r1 for backwards
+	@ compatibility.
+	adr	ip, LSYM(f0_ret)
+	@ Push pc as well so that RETLDM works correctly.
+	do_push	{r4, r5, ip, lr, pc}
+#else
+	do_push	{r4, r5, lr}
+#endif
+
+	ands	r5, ah, #0x80000000	@ sign bit in r5
+	bpl	2f
+#if defined(__thumb2__)
+	negs	al, al
+	sbc	ah, ah, ah, lsl #1
+#else
+	rsbs	al, al, #0
+	rsc	ah, ah, #0
+#endif
+2:
+	mov	r4, #0x400		@ initial exponent
+	add	r4, r4, #(52-1 - 1)
+
+	@ FPA little-endian: must swap the word order.
+	.ifnc	xh, ah
+	mov	ip, al
+	mov	xh, ah
+	mov	xl, ip
+	.endif
+
+	movs	ip, xh, lsr #22
+	beq	LSYM(Lad_p)
+
+	@ The value is too big.  Scale it down a bit...
+	mov	r2, #3
+	movs	ip, ip, lsr #3
+	do_it	ne
+	addne	r2, r2, #3
+	movs	ip, ip, lsr #3
+	do_it	ne
+	addne	r2, r2, #3
+	add	r2, r2, ip, lsr #3
+
+	rsb	r3, r2, #32
+	shift1	lsl, ip, xl, r3
+	shift1	lsr, xl, xl, r2
+	shiftop orr xl xl xh lsl r3 lr
+	shift1	lsr, xh, xh, r2
+	add	r4, r4, r2
+	b	LSYM(Lad_p)
+
+#if !defined (__VFP_FP__) && !defined(__SOFTFP__)
+
+	@ Legacy code expects the result to be returned in f0.  Copy it
+	@ there as well.
+LSYM(f0_ret):
+	do_push	{r0, r1}
+	ldfd	f0, [sp], #8
+	RETLDM
+
+#endif
+
+	FUNC_END floatdidf
+	FUNC_END aeabi_l2d
+	FUNC_END floatundidf
+	FUNC_END aeabi_ul2d
+
+#endif /* L_addsubdf3 */
+
+#ifdef L_arm_muldivdf3
+
+ARM_FUNC_START muldf3
+ARM_FUNC_ALIAS aeabi_dmul muldf3
+	do_push	{r4, r5, r6, lr}
+
+	@ Mask out exponents, trap any zero/denormal/INF/NAN.
+	mov	ip, #0xff
+	orr	ip, ip, #0x700
+	ands	r4, ip, xh, lsr #20
+	do_it	ne, tte
+	COND(and,s,ne)	r5, ip, yh, lsr #20
+	teqne	r4, ip
+	teqne	r5, ip
+	bleq	LSYM(Lml_s)
+
+	@ Add exponents together
+	add	r4, r4, r5
+
+	@ Determine final sign.
+	eor	r6, xh, yh
+
+	@ Convert mantissa to unsigned integer.
+	@ If power of two, branch to a separate path.
+	bic	xh, xh, ip, lsl #21
+	bic	yh, yh, ip, lsl #21
+	orrs	r5, xl, xh, lsl #12
+	do_it	ne
+	COND(orr,s,ne)	r5, yl, yh, lsl #12
+	orr	xh, xh, #0x00100000
+	orr	yh, yh, #0x00100000
+	beq	LSYM(Lml_1)
+
+#if __ARM_ARCH__ < 4
+
+	@ Put sign bit in r6, which will be restored in yl later.
+	and   r6, r6, #0x80000000
+
+	@ Well, no way to make it shorter without the umull instruction.
+	stmfd	sp!, {r6, r7, r8, r9, sl, fp}
+	mov	r7, xl, lsr #16
+	mov	r8, yl, lsr #16
+	mov	r9, xh, lsr #16
+	mov	sl, yh, lsr #16
+	bic	xl, xl, r7, lsl #16
+	bic	yl, yl, r8, lsl #16
+	bic	xh, xh, r9, lsl #16
+	bic	yh, yh, sl, lsl #16
+	mul	ip, xl, yl
+	mul	fp, xl, r8
+	mov	lr, #0
+	adds	ip, ip, fp, lsl #16
+	adc	lr, lr, fp, lsr #16
+	mul	fp, r7, yl
+	adds	ip, ip, fp, lsl #16
+	adc	lr, lr, fp, lsr #16
+	mul	fp, xl, sl
+	mov	r5, #0
+	adds	lr, lr, fp, lsl #16
+	adc	r5, r5, fp, lsr #16
+	mul	fp, r7, yh
+	adds	lr, lr, fp, lsl #16
+	adc	r5, r5, fp, lsr #16
+	mul	fp, xh, r8
+	adds	lr, lr, fp, lsl #16
+	adc	r5, r5, fp, lsr #16
+	mul	fp, r9, yl
+	adds	lr, lr, fp, lsl #16
+	adc	r5, r5, fp, lsr #16
+	mul	fp, xh, sl
+	mul	r6, r9, sl
+	adds	r5, r5, fp, lsl #16
+	adc	r6, r6, fp, lsr #16
+	mul	fp, r9, yh
+	adds	r5, r5, fp, lsl #16
+	adc	r6, r6, fp, lsr #16
+	mul	fp, xl, yh
+	adds	lr, lr, fp
+	mul	fp, r7, sl
+	adcs	r5, r5, fp
+	mul	fp, xh, yl
+	adc	r6, r6, #0
+	adds	lr, lr, fp
+	mul	fp, r9, r8
+	adcs	r5, r5, fp
+	mul	fp, r7, r8
+	adc	r6, r6, #0
+	adds	lr, lr, fp
+	mul	fp, xh, yh
+	adcs	r5, r5, fp
+	adc	r6, r6, #0
+	ldmfd	sp!, {yl, r7, r8, r9, sl, fp}
+
+#else
+
+	@ Here is the actual multiplication.
+	umull	ip, lr, xl, yl
+	mov	r5, #0
+	umlal	lr, r5, xh, yl
+	and	yl, r6, #0x80000000
+	umlal	lr, r5, xl, yh
+	mov	r6, #0
+	umlal	r5, r6, xh, yh
+
+#endif
+
+	@ The LSBs in ip are only significant for the final rounding.
+	@ Fold them into lr.
+	teq	ip, #0
+	do_it	ne
+	orrne	lr, lr, #1
+
+	@ Adjust result upon the MSB position.
+	sub	r4, r4, #0xff
+	cmp	r6, #(1 << (20-11))
+	sbc	r4, r4, #0x300
+	bcs	1f
+	movs	lr, lr, lsl #1
+	adcs	r5, r5, r5
+	adc	r6, r6, r6
+1:
+	@ Shift to final position, add sign to result.
+	orr	xh, yl, r6, lsl #11
+	orr	xh, xh, r5, lsr #21
+	mov	xl, r5, lsl #11
+	orr	xl, xl, lr, lsr #21
+	mov	lr, lr, lsl #11
+
+	@ Check exponent range for under/overflow.
+	subs	ip, r4, #(254 - 1)
+	do_it	hi
+	cmphi	ip, #0x700
+	bhi	LSYM(Lml_u)
+
+	@ Round the result, merge final exponent.
+	cmp	lr, #0x80000000
+	do_it	eq
+	COND(mov,s,eq)	lr, xl, lsr #1
+	adcs	xl, xl, #0
+	adc	xh, xh, r4, lsl #20
+	RETLDM	"r4, r5, r6"
+
+	@ Multiplication by 0x1p*: let''s shortcut a lot of code.
+LSYM(Lml_1):
+	and	r6, r6, #0x80000000
+	orr	xh, r6, xh
+	orr	xl, xl, yl
+	eor	xh, xh, yh
+	subs	r4, r4, ip, lsr #1
+	do_it	gt, tt
+	COND(rsb,s,gt)	r5, r4, ip
+	orrgt	xh, xh, r4, lsl #20
+	RETLDM	"r4, r5, r6" gt
+
+	@ Under/overflow: fix things up for the code below.
+	orr	xh, xh, #0x00100000
+	mov	lr, #0
+	subs	r4, r4, #1
+
+LSYM(Lml_u):
+	@ Overflow?
+	bgt	LSYM(Lml_o)
+
+	@ Check if denormalized result is possible, otherwise return signed 0.
+	cmn	r4, #(53 + 1)
+	do_it	le, tt
+	movle	xl, #0
+	bicle	xh, xh, #0x7fffffff
+	RETLDM	"r4, r5, r6" le
+
+	@ Find out proper shift value.
+	rsb	r4, r4, #0
+	subs	r4, r4, #32
+	bge	2f
+	adds	r4, r4, #12
+	bgt	1f
+
+	@ shift result right of 1 to 20 bits, preserve sign bit, round, etc.
+	add	r4, r4, #20
+	rsb	r5, r4, #32
+	shift1	lsl, r3, xl, r5
+	shift1	lsr, xl, xl, r4
+	shiftop orr xl xl xh lsl r5 r2
+	and	r2, xh, #0x80000000
+	bic	xh, xh, #0x80000000
+	adds	xl, xl, r3, lsr #31
+	shiftop adc xh r2 xh lsr r4 r6
+	orrs	lr, lr, r3, lsl #1
+	do_it	eq
+	biceq	xl, xl, r3, lsr #31
+	RETLDM	"r4, r5, r6"
+
+	@ shift result right of 21 to 31 bits, or left 11 to 1 bits after
+	@ a register switch from xh to xl. Then round.
+1:	rsb	r4, r4, #12
+	rsb	r5, r4, #32
+	shift1	lsl, r3, xl, r4
+	shift1	lsr, xl, xl, r5
+	shiftop orr xl xl xh lsl r4 r2
+	bic	xh, xh, #0x7fffffff
+	adds	xl, xl, r3, lsr #31
+	adc	xh, xh, #0
+	orrs	lr, lr, r3, lsl #1
+	do_it	eq
+	biceq	xl, xl, r3, lsr #31
+	RETLDM	"r4, r5, r6"
+
+	@ Shift value right of 32 to 64 bits, or 0 to 32 bits after a switch
+	@ from xh to xl.  Leftover bits are in r3-r6-lr for rounding.
+2:	rsb	r5, r4, #32
+	shiftop orr lr lr xl lsl r5 r2
+	shift1	lsr, r3, xl, r4
+	shiftop orr r3 r3 xh lsl r5 r2
+	shift1	lsr, xl, xh, r4
+	bic	xh, xh, #0x7fffffff
+	shiftop bic xl xl xh lsr r4 r2
+	add	xl, xl, r3, lsr #31
+	orrs	lr, lr, r3, lsl #1
+	do_it	eq
+	biceq	xl, xl, r3, lsr #31
+	RETLDM	"r4, r5, r6"
+
+	@ One or both arguments are denormalized.
+	@ Scale them leftwards and preserve sign bit.
+LSYM(Lml_d):
+	teq	r4, #0
+	bne	2f
+	and	r6, xh, #0x80000000
+1:	movs	xl, xl, lsl #1
+	adc	xh, xh, xh
+	tst	xh, #0x00100000
+	do_it	eq
+	subeq	r4, r4, #1
+	beq	1b
+	orr	xh, xh, r6
+	teq	r5, #0
+	do_it	ne
+	RETc(ne)
+2:	and	r6, yh, #0x80000000
+3:	movs	yl, yl, lsl #1
+	adc	yh, yh, yh
+	tst	yh, #0x00100000
+	do_it	eq
+	subeq	r5, r5, #1
+	beq	3b
+	orr	yh, yh, r6
+	RET
+
+LSYM(Lml_s):
+	@ Isolate the INF and NAN cases away
+	teq	r4, ip
+	and	r5, ip, yh, lsr #20
+	do_it	ne
+	teqne	r5, ip
+	beq	1f
+
+	@ Here, one or more arguments are either denormalized or zero.
+	orrs	r6, xl, xh, lsl #1
+	do_it	ne
+	COND(orr,s,ne)	r6, yl, yh, lsl #1
+	bne	LSYM(Lml_d)
+
+	@ Result is 0, but determine sign anyway.
+LSYM(Lml_z):
+	eor	xh, xh, yh
+	and	xh, xh, #0x80000000
+	mov	xl, #0
+	RETLDM	"r4, r5, r6"
+
+1:	@ One or both args are INF or NAN.
+	orrs	r6, xl, xh, lsl #1
+	do_it	eq, te
+	moveq	xl, yl
+	moveq	xh, yh
+	COND(orr,s,ne)	r6, yl, yh, lsl #1
+	beq	LSYM(Lml_n)		@ 0 * INF or INF * 0 -> NAN
+	teq	r4, ip
+	bne	1f
+	orrs	r6, xl, xh, lsl #12
+	bne	LSYM(Lml_n)		@ NAN * <anything> -> NAN
+1:	teq	r5, ip
+	bne	LSYM(Lml_i)
+	orrs	r6, yl, yh, lsl #12
+	do_it	ne, t
+	movne	xl, yl
+	movne	xh, yh
+	bne	LSYM(Lml_n)		@ <anything> * NAN -> NAN
+
+	@ Result is INF, but we need to determine its sign.
+LSYM(Lml_i):
+	eor	xh, xh, yh
+
+	@ Overflow: return INF (sign already in xh).
+LSYM(Lml_o):
+	and	xh, xh, #0x80000000
+	orr	xh, xh, #0x7f000000
+	orr	xh, xh, #0x00f00000
+	mov	xl, #0
+	RETLDM	"r4, r5, r6"
+
+	@ Return a quiet NAN.
+LSYM(Lml_n):
+	orr	xh, xh, #0x7f000000
+	orr	xh, xh, #0x00f80000
+	RETLDM	"r4, r5, r6"
+
+	FUNC_END aeabi_dmul
+	FUNC_END muldf3
+
+ARM_FUNC_START divdf3
+ARM_FUNC_ALIAS aeabi_ddiv divdf3
+	
+	do_push	{r4, r5, r6, lr}
+
+	@ Mask out exponents, trap any zero/denormal/INF/NAN.
+	mov	ip, #0xff
+	orr	ip, ip, #0x700
+	ands	r4, ip, xh, lsr #20
+	do_it	ne, tte
+	COND(and,s,ne)	r5, ip, yh, lsr #20
+	teqne	r4, ip
+	teqne	r5, ip
+	bleq	LSYM(Ldv_s)
+
+	@ Substract divisor exponent from dividend''s.
+	sub	r4, r4, r5
+
+	@ Preserve final sign into lr.
+	eor	lr, xh, yh
+
+	@ Convert mantissa to unsigned integer.
+	@ Dividend -> r5-r6, divisor -> yh-yl.
+	orrs	r5, yl, yh, lsl #12
+	mov	xh, xh, lsl #12
+	beq	LSYM(Ldv_1)
+	mov	yh, yh, lsl #12
+	mov	r5, #0x10000000
+	orr	yh, r5, yh, lsr #4
+	orr	yh, yh, yl, lsr #24
+	mov	yl, yl, lsl #8
+	orr	r5, r5, xh, lsr #4
+	orr	r5, r5, xl, lsr #24
+	mov	r6, xl, lsl #8
+
+	@ Initialize xh with final sign bit.
+	and	xh, lr, #0x80000000
+
+	@ Ensure result will land to known bit position.
+	@ Apply exponent bias accordingly.
+	cmp	r5, yh
+	do_it	eq
+	cmpeq	r6, yl
+	adc	r4, r4, #(255 - 2)
+	add	r4, r4, #0x300
+	bcs	1f
+	movs	yh, yh, lsr #1
+	mov	yl, yl, rrx
+1:
+	@ Perform first substraction to align result to a nibble.
+	subs	r6, r6, yl
+	sbc	r5, r5, yh
+	movs	yh, yh, lsr #1
+	mov	yl, yl, rrx
+	mov	xl, #0x00100000
+	mov	ip, #0x00080000
+
+	@ The actual division loop.
+1:	subs	lr, r6, yl
+	sbcs	lr, r5, yh
+	do_it	cs, tt
+	subcs	r6, r6, yl
+	movcs	r5, lr
+	orrcs	xl, xl, ip
+	movs	yh, yh, lsr #1
+	mov	yl, yl, rrx
+	subs	lr, r6, yl
+	sbcs	lr, r5, yh
+	do_it	cs, tt
+	subcs	r6, r6, yl
+	movcs	r5, lr
+	orrcs	xl, xl, ip, lsr #1
+	movs	yh, yh, lsr #1
+	mov	yl, yl, rrx
+	subs	lr, r6, yl
+	sbcs	lr, r5, yh
+	do_it	cs, tt
+	subcs	r6, r6, yl
+	movcs	r5, lr
+	orrcs	xl, xl, ip, lsr #2
+	movs	yh, yh, lsr #1
+	mov	yl, yl, rrx
+	subs	lr, r6, yl
+	sbcs	lr, r5, yh
+	do_it	cs, tt
+	subcs	r6, r6, yl
+	movcs	r5, lr
+	orrcs	xl, xl, ip, lsr #3
+
+	orrs	lr, r5, r6
+	beq	2f
+	mov	r5, r5, lsl #4
+	orr	r5, r5, r6, lsr #28
+	mov	r6, r6, lsl #4
+	mov	yh, yh, lsl #3
+	orr	yh, yh, yl, lsr #29
+	mov	yl, yl, lsl #3
+	movs	ip, ip, lsr #4
+	bne	1b
+
+	@ We are done with a word of the result.
+	@ Loop again for the low word if this pass was for the high word.
+	tst	xh, #0x00100000
+	bne	3f
+	orr	xh, xh, xl
+	mov	xl, #0
+	mov	ip, #0x80000000
+	b	1b
+2:
+	@ Be sure result starts in the high word.
+	tst	xh, #0x00100000
+	do_it	eq, t
+	orreq	xh, xh, xl
+	moveq	xl, #0
+3:
+	@ Check exponent range for under/overflow.
+	subs	ip, r4, #(254 - 1)
+	do_it	hi
+	cmphi	ip, #0x700
+	bhi	LSYM(Lml_u)
+
+	@ Round the result, merge final exponent.
+	subs	ip, r5, yh
+	do_it	eq, t
+	COND(sub,s,eq)	ip, r6, yl
+	COND(mov,s,eq)	ip, xl, lsr #1
+	adcs	xl, xl, #0
+	adc	xh, xh, r4, lsl #20
+	RETLDM	"r4, r5, r6"
+
+	@ Division by 0x1p*: shortcut a lot of code.
+LSYM(Ldv_1):
+	and	lr, lr, #0x80000000
+	orr	xh, lr, xh, lsr #12
+	adds	r4, r4, ip, lsr #1
+	do_it	gt, tt
+	COND(rsb,s,gt)	r5, r4, ip
+	orrgt	xh, xh, r4, lsl #20
+	RETLDM	"r4, r5, r6" gt
+
+	orr	xh, xh, #0x00100000
+	mov	lr, #0
+	subs	r4, r4, #1
+	b	LSYM(Lml_u)
+
+	@ Result mightt need to be denormalized: put remainder bits
+	@ in lr for rounding considerations.
+LSYM(Ldv_u):
+	orr	lr, r5, r6
+	b	LSYM(Lml_u)
+
+	@ One or both arguments is either INF, NAN or zero.
+LSYM(Ldv_s):
+	and	r5, ip, yh, lsr #20
+	teq	r4, ip
+	do_it	eq
+	teqeq	r5, ip
+	beq	LSYM(Lml_n)		@ INF/NAN / INF/NAN -> NAN
+	teq	r4, ip
+	bne	1f
+	orrs	r4, xl, xh, lsl #12
+	bne	LSYM(Lml_n)		@ NAN / <anything> -> NAN
+	teq	r5, ip
+	bne	LSYM(Lml_i)		@ INF / <anything> -> INF
+	mov	xl, yl
+	mov	xh, yh
+	b	LSYM(Lml_n)		@ INF / (INF or NAN) -> NAN
+1:	teq	r5, ip
+	bne	2f
+	orrs	r5, yl, yh, lsl #12
+	beq	LSYM(Lml_z)		@ <anything> / INF -> 0
+	mov	xl, yl
+	mov	xh, yh
+	b	LSYM(Lml_n)		@ <anything> / NAN -> NAN
+2:	@ If both are nonzero, we need to normalize and resume above.
+	orrs	r6, xl, xh, lsl #1
+	do_it	ne
+	COND(orr,s,ne)	r6, yl, yh, lsl #1
+	bne	LSYM(Lml_d)
+	@ One or both arguments are 0.
+	orrs	r4, xl, xh, lsl #1
+	bne	LSYM(Lml_i)		@ <non_zero> / 0 -> INF
+	orrs	r5, yl, yh, lsl #1
+	bne	LSYM(Lml_z)		@ 0 / <non_zero> -> 0
+	b	LSYM(Lml_n)		@ 0 / 0 -> NAN
+
+	FUNC_END aeabi_ddiv
+	FUNC_END divdf3
+
+#endif /* L_muldivdf3 */
+
+#ifdef L_arm_cmpdf2
+
+@ Note: only r0 (return value) and ip are clobbered here.
+
+ARM_FUNC_START gtdf2
+ARM_FUNC_ALIAS gedf2 gtdf2
+	mov	ip, #-1
+	b	1f
+
+ARM_FUNC_START ltdf2
+ARM_FUNC_ALIAS ledf2 ltdf2
+	mov	ip, #1
+	b	1f
+
+ARM_FUNC_START cmpdf2
+ARM_FUNC_ALIAS nedf2 cmpdf2
+ARM_FUNC_ALIAS eqdf2 cmpdf2
+	mov	ip, #1			@ how should we specify unordered here?
+
+1:	str	ip, [sp, #-4]!
+
+	@ Trap any INF/NAN first.
+	mov	ip, xh, lsl #1
+	mvns	ip, ip, asr #21
+	mov	ip, yh, lsl #1
+	do_it	ne
+	COND(mvn,s,ne)	ip, ip, asr #21
+	beq	3f
+
+	@ Test for equality.
+	@ Note that 0.0 is equal to -0.0.
+2:	add	sp, sp, #4
+	orrs	ip, xl, xh, lsl #1	@ if x == 0.0 or -0.0
+	do_it	eq, e
+	COND(orr,s,eq)	ip, yl, yh, lsl #1	@ and y == 0.0 or -0.0
+	teqne	xh, yh			@ or xh == yh
+	do_it	eq, tt
+	teqeq	xl, yl			@ and xl == yl
+	moveq	r0, #0			@ then equal.
+	RETc(eq)
+
+	@ Clear C flag
+	cmn	r0, #0
+
+	@ Compare sign, 
+	teq	xh, yh
+
+	@ Compare values if same sign
+	do_it	pl
+	cmppl	xh, yh
+	do_it	eq
+	cmpeq	xl, yl
+
+	@ Result:
+	do_it	cs, e
+	movcs	r0, yh, asr #31
+	mvncc	r0, yh, asr #31
+	orr	r0, r0, #1
+	RET
+
+	@ Look for a NAN.
+3:	mov	ip, xh, lsl #1
+	mvns	ip, ip, asr #21
+	bne	4f
+	orrs	ip, xl, xh, lsl #12
+	bne	5f			@ x is NAN
+4:	mov	ip, yh, lsl #1
+	mvns	ip, ip, asr #21
+	bne	2b
+	orrs	ip, yl, yh, lsl #12
+	beq	2b			@ y is not NAN
+5:	ldr	r0, [sp], #4		@ unordered return code
+	RET
+
+	FUNC_END gedf2
+	FUNC_END gtdf2
+	FUNC_END ledf2
+	FUNC_END ltdf2
+	FUNC_END nedf2
+	FUNC_END eqdf2
+	FUNC_END cmpdf2
+
+ARM_FUNC_START aeabi_cdrcmple
+
+	mov	ip, r0
+	mov	r0, r2
+	mov	r2, ip
+	mov	ip, r1
+	mov	r1, r3
+	mov	r3, ip
+	b	6f
+	
+ARM_FUNC_START aeabi_cdcmpeq
+ARM_FUNC_ALIAS aeabi_cdcmple aeabi_cdcmpeq
+
+	@ The status-returning routines are required to preserve all
+	@ registers except ip, lr, and cpsr.
+6:	do_push	{r0, lr}
+	ARM_CALL cmpdf2
+	@ Set the Z flag correctly, and the C flag unconditionally.
+	cmp	r0, #0
+	@ Clear the C flag if the return value was -1, indicating
+	@ that the first operand was smaller than the second.
+	do_it	mi
+	cmnmi	r0, #0
+	RETLDM	"r0"
+
+	FUNC_END aeabi_cdcmple
+	FUNC_END aeabi_cdcmpeq
+	FUNC_END aeabi_cdrcmple
+	
+ARM_FUNC_START	aeabi_dcmpeq
+
+	str	lr, [sp, #-8]!
+	ARM_CALL aeabi_cdcmple
+	do_it	eq, e
+	moveq	r0, #1	@ Equal to.
+	movne	r0, #0	@ Less than, greater than, or unordered.
+	RETLDM
+
+	FUNC_END aeabi_dcmpeq
+
+ARM_FUNC_START	aeabi_dcmplt
+
+	str	lr, [sp, #-8]!
+	ARM_CALL aeabi_cdcmple
+	do_it	cc, e
+	movcc	r0, #1	@ Less than.
+	movcs	r0, #0	@ Equal to, greater than, or unordered.
+	RETLDM
+
+	FUNC_END aeabi_dcmplt
+
+ARM_FUNC_START	aeabi_dcmple
+
+	str	lr, [sp, #-8]!
+	ARM_CALL aeabi_cdcmple
+	do_it	ls, e
+	movls	r0, #1  @ Less than or equal to.
+	movhi	r0, #0	@ Greater than or unordered.
+	RETLDM
+
+	FUNC_END aeabi_dcmple
+
+ARM_FUNC_START	aeabi_dcmpge
+
+	str	lr, [sp, #-8]!
+	ARM_CALL aeabi_cdrcmple
+	do_it	ls, e
+	movls	r0, #1	@ Operand 2 is less than or equal to operand 1.
+	movhi	r0, #0	@ Operand 2 greater than operand 1, or unordered.
+	RETLDM
+
+	FUNC_END aeabi_dcmpge
+
+ARM_FUNC_START	aeabi_dcmpgt
+
+	str	lr, [sp, #-8]!
+	ARM_CALL aeabi_cdrcmple
+	do_it	cc, e
+	movcc	r0, #1	@ Operand 2 is less than operand 1.
+	movcs	r0, #0  @ Operand 2 is greater than or equal to operand 1,
+			@ or they are unordered.
+	RETLDM
+
+	FUNC_END aeabi_dcmpgt
+
+#endif /* L_cmpdf2 */
+
+#ifdef L_arm_unorddf2
+
+ARM_FUNC_START unorddf2
+ARM_FUNC_ALIAS aeabi_dcmpun unorddf2
+
+	mov	ip, xh, lsl #1
+	mvns	ip, ip, asr #21
+	bne	1f
+	orrs	ip, xl, xh, lsl #12
+	bne	3f			@ x is NAN
+1:	mov	ip, yh, lsl #1
+	mvns	ip, ip, asr #21
+	bne	2f
+	orrs	ip, yl, yh, lsl #12
+	bne	3f			@ y is NAN
+2:	mov	r0, #0			@ arguments are ordered.
+	RET
+
+3:	mov	r0, #1			@ arguments are unordered.
+	RET
+
+	FUNC_END aeabi_dcmpun
+	FUNC_END unorddf2
+
+#endif /* L_unorddf2 */
+
+#ifdef L_arm_fixdfsi
+
+ARM_FUNC_START fixdfsi
+ARM_FUNC_ALIAS aeabi_d2iz fixdfsi
+
+	@ check exponent range.
+	mov	r2, xh, lsl #1
+	adds	r2, r2, #(1 << 21)
+	bcs	2f			@ value is INF or NAN
+	bpl	1f			@ value is too small
+	mov	r3, #(0xfffffc00 + 31)
+	subs	r2, r3, r2, asr #21
+	bls	3f			@ value is too large
+
+	@ scale value
+	mov	r3, xh, lsl #11
+	orr	r3, r3, #0x80000000
+	orr	r3, r3, xl, lsr #21
+	tst	xh, #0x80000000		@ the sign bit
+	shift1	lsr, r0, r3, r2
+	do_it	ne
+	rsbne	r0, r0, #0
+	RET
+
+1:	mov	r0, #0
+	RET
+
+2:	orrs	xl, xl, xh, lsl #12
+	bne	4f			@ x is NAN.
+3:	ands	r0, xh, #0x80000000	@ the sign bit
+	do_it	eq
+	moveq	r0, #0x7fffffff		@ maximum signed positive si
+	RET
+
+4:	mov	r0, #0			@ How should we convert NAN?
+	RET
+
+	FUNC_END aeabi_d2iz
+	FUNC_END fixdfsi
+
+#endif /* L_fixdfsi */
+
+#ifdef L_arm_fixunsdfsi
+
+ARM_FUNC_START fixunsdfsi
+ARM_FUNC_ALIAS aeabi_d2uiz fixunsdfsi
+
+	@ check exponent range.
+	movs	r2, xh, lsl #1
+	bcs	1f			@ value is negative
+	adds	r2, r2, #(1 << 21)
+	bcs	2f			@ value is INF or NAN
+	bpl	1f			@ value is too small
+	mov	r3, #(0xfffffc00 + 31)
+	subs	r2, r3, r2, asr #21
+	bmi	3f			@ value is too large
+
+	@ scale value
+	mov	r3, xh, lsl #11
+	orr	r3, r3, #0x80000000
+	orr	r3, r3, xl, lsr #21
+	shift1	lsr, r0, r3, r2
+	RET
+
+1:	mov	r0, #0
+	RET
+
+2:	orrs	xl, xl, xh, lsl #12
+	bne	4f			@ value is NAN.
+3:	mov	r0, #0xffffffff		@ maximum unsigned si
+	RET
+
+4:	mov	r0, #0			@ How should we convert NAN?
+	RET
+
+	FUNC_END aeabi_d2uiz
+	FUNC_END fixunsdfsi
+
+#endif /* L_fixunsdfsi */
+
+#ifdef L_arm_truncdfsf2
+
+ARM_FUNC_START truncdfsf2
+ARM_FUNC_ALIAS aeabi_d2f truncdfsf2
+
+	@ check exponent range.
+	mov	r2, xh, lsl #1
+	subs	r3, r2, #((1023 - 127) << 21)
+	do_it	cs, t
+	COND(sub,s,cs)	ip, r3, #(1 << 21)
+	COND(rsb,s,cs)	ip, ip, #(254 << 21)
+	bls	2f			@ value is out of range
+
+1:	@ shift and round mantissa
+	and	ip, xh, #0x80000000
+	mov	r2, xl, lsl #3
+	orr	xl, ip, xl, lsr #29
+	cmp	r2, #0x80000000
+	adc	r0, xl, r3, lsl #2
+	do_it	eq
+	biceq	r0, r0, #1
+	RET
+
+2:	@ either overflow or underflow
+	tst	xh, #0x40000000
+	bne	3f			@ overflow
+
+	@ check if denormalized value is possible
+	adds	r2, r3, #(23 << 21)
+	do_it	lt, t
+	andlt	r0, xh, #0x80000000	@ too small, return signed 0.
+	RETc(lt)
+
+	@ denormalize value so we can resume with the code above afterwards.
+	orr	xh, xh, #0x00100000
+	mov	r2, r2, lsr #21
+	rsb	r2, r2, #24
+	rsb	ip, r2, #32
+#if defined(__thumb2__)
+	lsls	r3, xl, ip
+#else
+	movs	r3, xl, lsl ip
+#endif
+	shift1	lsr, xl, xl, r2
+	do_it	ne
+	orrne	xl, xl, #1		@ fold r3 for rounding considerations. 
+	mov	r3, xh, lsl #11
+	mov	r3, r3, lsr #11
+	shiftop orr xl xl r3 lsl ip ip
+	shift1	lsr, r3, r3, r2
+	mov	r3, r3, lsl #1
+	b	1b
+
+3:	@ chech for NAN
+	mvns	r3, r2, asr #21
+	bne	5f			@ simple overflow
+	orrs	r3, xl, xh, lsl #12
+	do_it	ne, tt
+	movne	r0, #0x7f000000
+	orrne	r0, r0, #0x00c00000
+	RETc(ne)			@ return NAN
+
+5:	@ return INF with sign
+	and	r0, xh, #0x80000000
+	orr	r0, r0, #0x7f000000
+	orr	r0, r0, #0x00800000
+	RET
+
+	FUNC_END aeabi_d2f
+	FUNC_END truncdfsf2
+
+#endif /* L_truncdfsf2 */
diff --git a/libgcc/config/arm/ieee754-sf.S b/libgcc/config/arm/ieee754-sf.S
new file mode 100644
index 00000000000..c93f66d8ff8
--- /dev/null
+++ b/libgcc/config/arm/ieee754-sf.S
@@ -0,0 +1,1060 @@
+/* ieee754-sf.S single-precision floating point support for ARM
+
+   Copyright (C) 2003, 2004, 2005, 2007, 2008, 2009  Free Software Foundation, Inc.
+   Contributed by Nicolas Pitre (nico@cam.org)
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/*
+ * Notes:
+ *
+ * The goal of this code is to be as fast as possible.  This is
+ * not meant to be easy to understand for the casual reader.
+ *
+ * Only the default rounding mode is intended for best performances.
+ * Exceptions aren't supported yet, but that can be added quite easily
+ * if necessary without impacting performances.
+ */
+
+#ifdef L_arm_negsf2
+	
+ARM_FUNC_START negsf2
+ARM_FUNC_ALIAS aeabi_fneg negsf2
+
+	eor	r0, r0, #0x80000000	@ flip sign bit
+	RET
+
+	FUNC_END aeabi_fneg
+	FUNC_END negsf2
+
+#endif
+
+#ifdef L_arm_addsubsf3
+
+ARM_FUNC_START aeabi_frsub
+
+	eor	r0, r0, #0x80000000	@ flip sign bit of first arg
+	b	1f
+
+ARM_FUNC_START subsf3
+ARM_FUNC_ALIAS aeabi_fsub subsf3
+
+	eor	r1, r1, #0x80000000	@ flip sign bit of second arg
+#if defined(__INTERWORKING_STUBS__)
+	b	1f			@ Skip Thumb-code prologue
+#endif
+
+ARM_FUNC_START addsf3
+ARM_FUNC_ALIAS aeabi_fadd addsf3
+
+1:	@ Look for zeroes, equal values, INF, or NAN.
+	movs	r2, r0, lsl #1
+	do_it	ne, ttt
+	COND(mov,s,ne)	r3, r1, lsl #1
+	teqne	r2, r3
+	COND(mvn,s,ne)	ip, r2, asr #24
+	COND(mvn,s,ne)	ip, r3, asr #24
+	beq	LSYM(Lad_s)
+
+	@ Compute exponent difference.  Make largest exponent in r2,
+	@ corresponding arg in r0, and positive exponent difference in r3.
+	mov	r2, r2, lsr #24
+	rsbs	r3, r2, r3, lsr #24
+	do_it	gt, ttt
+	addgt	r2, r2, r3
+	eorgt	r1, r0, r1
+	eorgt	r0, r1, r0
+	eorgt	r1, r0, r1
+	do_it	lt
+	rsblt	r3, r3, #0
+
+	@ If exponent difference is too large, return largest argument
+	@ already in r0.  We need up to 25 bit to handle proper rounding
+	@ of 0x1p25 - 1.1.
+	cmp	r3, #25
+	do_it	hi
+	RETc(hi)
+
+	@ Convert mantissa to signed integer.
+	tst	r0, #0x80000000
+	orr	r0, r0, #0x00800000
+	bic	r0, r0, #0xff000000
+	do_it	ne
+	rsbne	r0, r0, #0
+	tst	r1, #0x80000000
+	orr	r1, r1, #0x00800000
+	bic	r1, r1, #0xff000000
+	do_it	ne
+	rsbne	r1, r1, #0
+
+	@ If exponent == difference, one or both args were denormalized.
+	@ Since this is not common case, rescale them off line.
+	teq	r2, r3
+	beq	LSYM(Lad_d)
+LSYM(Lad_x):
+
+	@ Compensate for the exponent overlapping the mantissa MSB added later
+	sub	r2, r2, #1
+
+	@ Shift and add second arg to first arg in r0.
+	@ Keep leftover bits into r1.
+	shiftop adds r0 r0 r1 asr r3 ip
+	rsb	r3, r3, #32
+	shift1	lsl, r1, r1, r3
+
+	@ Keep absolute value in r0-r1, sign in r3 (the n bit was set above)
+	and	r3, r0, #0x80000000
+	bpl	LSYM(Lad_p)
+#if defined(__thumb2__)
+	negs	r1, r1
+	sbc	r0, r0, r0, lsl #1
+#else
+	rsbs	r1, r1, #0
+	rsc	r0, r0, #0
+#endif
+
+	@ Determine how to normalize the result.
+LSYM(Lad_p):
+	cmp	r0, #0x00800000
+	bcc	LSYM(Lad_a)
+	cmp	r0, #0x01000000
+	bcc	LSYM(Lad_e)
+
+	@ Result needs to be shifted right.
+	movs	r0, r0, lsr #1
+	mov	r1, r1, rrx
+	add	r2, r2, #1
+
+	@ Make sure we did not bust our exponent.
+	cmp	r2, #254
+	bhs	LSYM(Lad_o)
+
+	@ Our result is now properly aligned into r0, remaining bits in r1.
+	@ Pack final result together.
+	@ Round with MSB of r1. If halfway between two numbers, round towards
+	@ LSB of r0 = 0. 
+LSYM(Lad_e):
+	cmp	r1, #0x80000000
+	adc	r0, r0, r2, lsl #23
+	do_it	eq
+	biceq	r0, r0, #1
+	orr	r0, r0, r3
+	RET
+
+	@ Result must be shifted left and exponent adjusted.
+LSYM(Lad_a):
+	movs	r1, r1, lsl #1
+	adc	r0, r0, r0
+	tst	r0, #0x00800000
+	sub	r2, r2, #1
+	bne	LSYM(Lad_e)
+	
+	@ No rounding necessary since r1 will always be 0 at this point.
+LSYM(Lad_l):
+
+#if __ARM_ARCH__ < 5
+
+	movs	ip, r0, lsr #12
+	moveq	r0, r0, lsl #12
+	subeq	r2, r2, #12
+	tst	r0, #0x00ff0000
+	moveq	r0, r0, lsl #8
+	subeq	r2, r2, #8
+	tst	r0, #0x00f00000
+	moveq	r0, r0, lsl #4
+	subeq	r2, r2, #4
+	tst	r0, #0x00c00000
+	moveq	r0, r0, lsl #2
+	subeq	r2, r2, #2
+	cmp	r0, #0x00800000
+	movcc	r0, r0, lsl #1
+	sbcs	r2, r2, #0
+
+#else
+
+	clz	ip, r0
+	sub	ip, ip, #8
+	subs	r2, r2, ip
+	shift1	lsl, r0, r0, ip
+
+#endif
+
+	@ Final result with sign
+	@ If exponent negative, denormalize result.
+	do_it	ge, et
+	addge	r0, r0, r2, lsl #23
+	rsblt	r2, r2, #0
+	orrge	r0, r0, r3
+#if defined(__thumb2__)
+	do_it	lt, t
+	lsrlt	r0, r0, r2
+	orrlt	r0, r3, r0
+#else
+	orrlt	r0, r3, r0, lsr r2
+#endif
+	RET
+
+	@ Fixup and adjust bit position for denormalized arguments.
+	@ Note that r2 must not remain equal to 0.
+LSYM(Lad_d):
+	teq	r2, #0
+	eor	r1, r1, #0x00800000
+	do_it	eq, te
+	eoreq	r0, r0, #0x00800000
+	addeq	r2, r2, #1
+	subne	r3, r3, #1
+	b	LSYM(Lad_x)
+
+LSYM(Lad_s):
+	mov	r3, r1, lsl #1
+
+	mvns	ip, r2, asr #24
+	do_it	ne
+	COND(mvn,s,ne)	ip, r3, asr #24
+	beq	LSYM(Lad_i)
+
+	teq	r2, r3
+	beq	1f
+
+	@ Result is x + 0.0 = x or 0.0 + y = y.
+	teq	r2, #0
+	do_it	eq
+	moveq	r0, r1
+	RET
+
+1:	teq	r0, r1
+
+	@ Result is x - x = 0.
+	do_it	ne, t
+	movne	r0, #0
+	RETc(ne)
+
+	@ Result is x + x = 2x.
+	tst	r2, #0xff000000
+	bne	2f
+	movs	r0, r0, lsl #1
+	do_it	cs
+	orrcs	r0, r0, #0x80000000
+	RET
+2:	adds	r2, r2, #(2 << 24)
+	do_it	cc, t
+	addcc	r0, r0, #(1 << 23)
+	RETc(cc)
+	and	r3, r0, #0x80000000
+
+	@ Overflow: return INF.
+LSYM(Lad_o):
+	orr	r0, r3, #0x7f000000
+	orr	r0, r0, #0x00800000
+	RET
+
+	@ At least one of r0/r1 is INF/NAN.
+	@   if r0 != INF/NAN: return r1 (which is INF/NAN)
+	@   if r1 != INF/NAN: return r0 (which is INF/NAN)
+	@   if r0 or r1 is NAN: return NAN
+	@   if opposite sign: return NAN
+	@   otherwise return r0 (which is INF or -INF)
+LSYM(Lad_i):
+	mvns	r2, r2, asr #24
+	do_it	ne, et
+	movne	r0, r1
+	COND(mvn,s,eq)	r3, r3, asr #24
+	movne	r1, r0
+	movs	r2, r0, lsl #9
+	do_it	eq, te
+	COND(mov,s,eq)	r3, r1, lsl #9
+	teqeq	r0, r1
+	orrne	r0, r0, #0x00400000	@ quiet NAN
+	RET
+
+	FUNC_END aeabi_frsub
+	FUNC_END aeabi_fadd
+	FUNC_END addsf3
+	FUNC_END aeabi_fsub
+	FUNC_END subsf3
+
+ARM_FUNC_START floatunsisf
+ARM_FUNC_ALIAS aeabi_ui2f floatunsisf
+		
+	mov	r3, #0
+	b	1f
+
+ARM_FUNC_START floatsisf
+ARM_FUNC_ALIAS aeabi_i2f floatsisf
+	
+	ands	r3, r0, #0x80000000
+	do_it	mi
+	rsbmi	r0, r0, #0
+
+1:	movs	ip, r0
+	do_it	eq
+	RETc(eq)
+
+	@ Add initial exponent to sign
+	orr	r3, r3, #((127 + 23) << 23)
+
+	.ifnc	ah, r0
+	mov	ah, r0
+	.endif
+	mov	al, #0
+	b	2f
+
+	FUNC_END aeabi_i2f
+	FUNC_END floatsisf
+	FUNC_END aeabi_ui2f
+	FUNC_END floatunsisf
+
+ARM_FUNC_START floatundisf
+ARM_FUNC_ALIAS aeabi_ul2f floatundisf
+
+	orrs	r2, r0, r1
+#if !defined (__VFP_FP__) && !defined(__SOFTFP__)
+	do_it	eq, t
+	mvfeqs	f0, #0.0
+#else
+	do_it	eq
+#endif
+	RETc(eq)
+
+	mov	r3, #0
+	b	1f
+
+ARM_FUNC_START floatdisf
+ARM_FUNC_ALIAS aeabi_l2f floatdisf
+
+	orrs	r2, r0, r1
+#if !defined (__VFP_FP__) && !defined(__SOFTFP__)
+	do_it	eq, t
+	mvfeqs	f0, #0.0
+#else
+	do_it	eq
+#endif
+	RETc(eq)
+
+	ands	r3, ah, #0x80000000	@ sign bit in r3
+	bpl	1f
+#if defined(__thumb2__)
+	negs	al, al
+	sbc	ah, ah, ah, lsl #1
+#else
+	rsbs	al, al, #0
+	rsc	ah, ah, #0
+#endif
+1:
+#if !defined (__VFP_FP__) && !defined(__SOFTFP__)
+	@ For hard FPA code we want to return via the tail below so that
+	@ we can return the result in f0 as well as in r0 for backwards
+	@ compatibility.
+	str	lr, [sp, #-8]!
+	adr	lr, LSYM(f0_ret)
+#endif
+
+	movs	ip, ah
+	do_it	eq, tt
+	moveq	ip, al
+	moveq	ah, al
+	moveq	al, #0
+
+	@ Add initial exponent to sign
+	orr	r3, r3, #((127 + 23 + 32) << 23)
+	do_it	eq
+	subeq	r3, r3, #(32 << 23)
+2:	sub	r3, r3, #(1 << 23)
+
+#if __ARM_ARCH__ < 5
+
+	mov	r2, #23
+	cmp	ip, #(1 << 16)
+	do_it	hs, t
+	movhs	ip, ip, lsr #16
+	subhs	r2, r2, #16
+	cmp	ip, #(1 << 8)
+	do_it	hs, t
+	movhs	ip, ip, lsr #8
+	subhs	r2, r2, #8
+	cmp	ip, #(1 << 4)
+	do_it	hs, t
+	movhs	ip, ip, lsr #4
+	subhs	r2, r2, #4
+	cmp	ip, #(1 << 2)
+	do_it	hs, e
+	subhs	r2, r2, #2
+	sublo	r2, r2, ip, lsr #1
+	subs	r2, r2, ip, lsr #3
+
+#else
+
+	clz	r2, ip
+	subs	r2, r2, #8
+
+#endif
+
+	sub	r3, r3, r2, lsl #23
+	blt	3f
+
+	shiftop add r3 r3 ah lsl r2 ip
+	shift1	lsl, ip, al, r2
+	rsb	r2, r2, #32
+	cmp	ip, #0x80000000
+	shiftop adc r0 r3 al lsr r2 r2
+	do_it	eq
+	biceq	r0, r0, #1
+	RET
+
+3:	add	r2, r2, #32
+	shift1	lsl, ip, ah, r2
+	rsb	r2, r2, #32
+	orrs	al, al, ip, lsl #1
+	shiftop adc r0 r3 ah lsr r2 r2
+	do_it	eq
+	biceq	r0, r0, ip, lsr #31
+	RET
+
+#if !defined (__VFP_FP__) && !defined(__SOFTFP__)
+
+LSYM(f0_ret):
+	str	r0, [sp, #-4]!
+	ldfs	f0, [sp], #4
+	RETLDM
+
+#endif
+
+	FUNC_END floatdisf
+	FUNC_END aeabi_l2f
+	FUNC_END floatundisf
+	FUNC_END aeabi_ul2f
+
+#endif /* L_addsubsf3 */
+
+#ifdef L_arm_muldivsf3
+
+ARM_FUNC_START mulsf3
+ARM_FUNC_ALIAS aeabi_fmul mulsf3
+
+	@ Mask out exponents, trap any zero/denormal/INF/NAN.
+	mov	ip, #0xff
+	ands	r2, ip, r0, lsr #23
+	do_it	ne, tt
+	COND(and,s,ne)	r3, ip, r1, lsr #23
+	teqne	r2, ip
+	teqne	r3, ip
+	beq	LSYM(Lml_s)
+LSYM(Lml_x):
+
+	@ Add exponents together
+	add	r2, r2, r3
+
+	@ Determine final sign.
+	eor	ip, r0, r1
+
+	@ Convert mantissa to unsigned integer.
+	@ If power of two, branch to a separate path.
+	@ Make up for final alignment.
+	movs	r0, r0, lsl #9
+	do_it	ne
+	COND(mov,s,ne)	r1, r1, lsl #9
+	beq	LSYM(Lml_1)
+	mov	r3, #0x08000000
+	orr	r0, r3, r0, lsr #5
+	orr	r1, r3, r1, lsr #5
+
+#if __ARM_ARCH__ < 4
+
+	@ Put sign bit in r3, which will be restored into r0 later.
+	and	r3, ip, #0x80000000
+
+	@ Well, no way to make it shorter without the umull instruction.
+	do_push	{r3, r4, r5}
+	mov	r4, r0, lsr #16
+	mov	r5, r1, lsr #16
+	bic	r0, r0, r4, lsl #16
+	bic	r1, r1, r5, lsl #16
+	mul	ip, r4, r5
+	mul	r3, r0, r1
+	mul	r0, r5, r0
+	mla	r0, r4, r1, r0
+	adds	r3, r3, r0, lsl #16
+	adc	r1, ip, r0, lsr #16
+	do_pop	{r0, r4, r5}
+
+#else
+
+	@ The actual multiplication.
+	umull	r3, r1, r0, r1
+
+	@ Put final sign in r0.
+	and	r0, ip, #0x80000000
+
+#endif
+
+	@ Adjust result upon the MSB position.
+	cmp	r1, #(1 << 23)
+	do_it	cc, tt
+	movcc	r1, r1, lsl #1
+	orrcc	r1, r1, r3, lsr #31
+	movcc	r3, r3, lsl #1
+
+	@ Add sign to result.
+	orr	r0, r0, r1
+
+	@ Apply exponent bias, check for under/overflow.
+	sbc	r2, r2, #127
+	cmp	r2, #(254 - 1)
+	bhi	LSYM(Lml_u)
+
+	@ Round the result, merge final exponent.
+	cmp	r3, #0x80000000
+	adc	r0, r0, r2, lsl #23
+	do_it	eq
+	biceq	r0, r0, #1
+	RET
+
+	@ Multiplication by 0x1p*: let''s shortcut a lot of code.
+LSYM(Lml_1):
+	teq	r0, #0
+	and	ip, ip, #0x80000000
+	do_it	eq
+	moveq	r1, r1, lsl #9
+	orr	r0, ip, r0, lsr #9
+	orr	r0, r0, r1, lsr #9
+	subs	r2, r2, #127
+	do_it	gt, tt
+	COND(rsb,s,gt)	r3, r2, #255
+	orrgt	r0, r0, r2, lsl #23
+	RETc(gt)
+
+	@ Under/overflow: fix things up for the code below.
+	orr	r0, r0, #0x00800000
+	mov	r3, #0
+	subs	r2, r2, #1
+
+LSYM(Lml_u):
+	@ Overflow?
+	bgt	LSYM(Lml_o)
+
+	@ Check if denormalized result is possible, otherwise return signed 0.
+	cmn	r2, #(24 + 1)
+	do_it	le, t
+	bicle	r0, r0, #0x7fffffff
+	RETc(le)
+
+	@ Shift value right, round, etc.
+	rsb	r2, r2, #0
+	movs	r1, r0, lsl #1
+	shift1	lsr, r1, r1, r2
+	rsb	r2, r2, #32
+	shift1	lsl, ip, r0, r2
+	movs	r0, r1, rrx
+	adc	r0, r0, #0
+	orrs	r3, r3, ip, lsl #1
+	do_it	eq
+	biceq	r0, r0, ip, lsr #31
+	RET
+
+	@ One or both arguments are denormalized.
+	@ Scale them leftwards and preserve sign bit.
+LSYM(Lml_d):
+	teq	r2, #0
+	and	ip, r0, #0x80000000
+1:	do_it	eq, tt
+	moveq	r0, r0, lsl #1
+	tsteq	r0, #0x00800000
+	subeq	r2, r2, #1
+	beq	1b
+	orr	r0, r0, ip
+	teq	r3, #0
+	and	ip, r1, #0x80000000
+2:	do_it	eq, tt
+	moveq	r1, r1, lsl #1
+	tsteq	r1, #0x00800000
+	subeq	r3, r3, #1
+	beq	2b
+	orr	r1, r1, ip
+	b	LSYM(Lml_x)
+
+LSYM(Lml_s):
+	@ Isolate the INF and NAN cases away
+	and	r3, ip, r1, lsr #23
+	teq	r2, ip
+	do_it	ne
+	teqne	r3, ip
+	beq	1f
+
+	@ Here, one or more arguments are either denormalized or zero.
+	bics	ip, r0, #0x80000000
+	do_it	ne
+	COND(bic,s,ne)	ip, r1, #0x80000000
+	bne	LSYM(Lml_d)
+
+	@ Result is 0, but determine sign anyway.
+LSYM(Lml_z):
+	eor	r0, r0, r1
+	bic	r0, r0, #0x7fffffff
+	RET
+
+1:	@ One or both args are INF or NAN.
+	teq	r0, #0x0
+	do_it	ne, ett
+	teqne	r0, #0x80000000
+	moveq	r0, r1
+	teqne	r1, #0x0
+	teqne	r1, #0x80000000
+	beq	LSYM(Lml_n)		@ 0 * INF or INF * 0 -> NAN
+	teq	r2, ip
+	bne	1f
+	movs	r2, r0, lsl #9
+	bne	LSYM(Lml_n)		@ NAN * <anything> -> NAN
+1:	teq	r3, ip
+	bne	LSYM(Lml_i)
+	movs	r3, r1, lsl #9
+	do_it	ne
+	movne	r0, r1
+	bne	LSYM(Lml_n)		@ <anything> * NAN -> NAN
+
+	@ Result is INF, but we need to determine its sign.
+LSYM(Lml_i):
+	eor	r0, r0, r1
+
+	@ Overflow: return INF (sign already in r0).
+LSYM(Lml_o):
+	and	r0, r0, #0x80000000
+	orr	r0, r0, #0x7f000000
+	orr	r0, r0, #0x00800000
+	RET
+
+	@ Return a quiet NAN.
+LSYM(Lml_n):
+	orr	r0, r0, #0x7f000000
+	orr	r0, r0, #0x00c00000
+	RET
+
+	FUNC_END aeabi_fmul
+	FUNC_END mulsf3
+
+ARM_FUNC_START divsf3
+ARM_FUNC_ALIAS aeabi_fdiv divsf3
+
+	@ Mask out exponents, trap any zero/denormal/INF/NAN.
+	mov	ip, #0xff
+	ands	r2, ip, r0, lsr #23
+	do_it	ne, tt
+	COND(and,s,ne)	r3, ip, r1, lsr #23
+	teqne	r2, ip
+	teqne	r3, ip
+	beq	LSYM(Ldv_s)
+LSYM(Ldv_x):
+
+	@ Substract divisor exponent from dividend''s
+	sub	r2, r2, r3
+
+	@ Preserve final sign into ip.
+	eor	ip, r0, r1
+
+	@ Convert mantissa to unsigned integer.
+	@ Dividend -> r3, divisor -> r1.
+	movs	r1, r1, lsl #9
+	mov	r0, r0, lsl #9
+	beq	LSYM(Ldv_1)
+	mov	r3, #0x10000000
+	orr	r1, r3, r1, lsr #4
+	orr	r3, r3, r0, lsr #4
+
+	@ Initialize r0 (result) with final sign bit.
+	and	r0, ip, #0x80000000
+
+	@ Ensure result will land to known bit position.
+	@ Apply exponent bias accordingly.
+	cmp	r3, r1
+	do_it	cc
+	movcc	r3, r3, lsl #1
+	adc	r2, r2, #(127 - 2)
+
+	@ The actual division loop.
+	mov	ip, #0x00800000
+1:	cmp	r3, r1
+	do_it	cs, t
+	subcs	r3, r3, r1
+	orrcs	r0, r0, ip
+	cmp	r3, r1, lsr #1
+	do_it	cs, t
+	subcs	r3, r3, r1, lsr #1
+	orrcs	r0, r0, ip, lsr #1
+	cmp	r3, r1, lsr #2
+	do_it	cs, t
+	subcs	r3, r3, r1, lsr #2
+	orrcs	r0, r0, ip, lsr #2
+	cmp	r3, r1, lsr #3
+	do_it	cs, t
+	subcs	r3, r3, r1, lsr #3
+	orrcs	r0, r0, ip, lsr #3
+	movs	r3, r3, lsl #4
+	do_it	ne
+	COND(mov,s,ne)	ip, ip, lsr #4
+	bne	1b
+
+	@ Check exponent for under/overflow.
+	cmp	r2, #(254 - 1)
+	bhi	LSYM(Lml_u)
+
+	@ Round the result, merge final exponent.
+	cmp	r3, r1
+	adc	r0, r0, r2, lsl #23
+	do_it	eq
+	biceq	r0, r0, #1
+	RET
+
+	@ Division by 0x1p*: let''s shortcut a lot of code.
+LSYM(Ldv_1):
+	and	ip, ip, #0x80000000
+	orr	r0, ip, r0, lsr #9
+	adds	r2, r2, #127
+	do_it	gt, tt
+	COND(rsb,s,gt)	r3, r2, #255
+	orrgt	r0, r0, r2, lsl #23
+	RETc(gt)
+
+	orr	r0, r0, #0x00800000
+	mov	r3, #0
+	subs	r2, r2, #1
+	b	LSYM(Lml_u)
+
+	@ One or both arguments are denormalized.
+	@ Scale them leftwards and preserve sign bit.
+LSYM(Ldv_d):
+	teq	r2, #0
+	and	ip, r0, #0x80000000
+1:	do_it	eq, tt
+	moveq	r0, r0, lsl #1
+	tsteq	r0, #0x00800000
+	subeq	r2, r2, #1
+	beq	1b
+	orr	r0, r0, ip
+	teq	r3, #0
+	and	ip, r1, #0x80000000
+2:	do_it	eq, tt
+	moveq	r1, r1, lsl #1
+	tsteq	r1, #0x00800000
+	subeq	r3, r3, #1
+	beq	2b
+	orr	r1, r1, ip
+	b	LSYM(Ldv_x)
+
+	@ One or both arguments are either INF, NAN, zero or denormalized.
+LSYM(Ldv_s):
+	and	r3, ip, r1, lsr #23
+	teq	r2, ip
+	bne	1f
+	movs	r2, r0, lsl #9
+	bne	LSYM(Lml_n)		@ NAN / <anything> -> NAN
+	teq	r3, ip
+	bne	LSYM(Lml_i)		@ INF / <anything> -> INF
+	mov	r0, r1
+	b	LSYM(Lml_n)		@ INF / (INF or NAN) -> NAN
+1:	teq	r3, ip
+	bne	2f
+	movs	r3, r1, lsl #9
+	beq	LSYM(Lml_z)		@ <anything> / INF -> 0
+	mov	r0, r1
+	b	LSYM(Lml_n)		@ <anything> / NAN -> NAN
+2:	@ If both are nonzero, we need to normalize and resume above.
+	bics	ip, r0, #0x80000000
+	do_it	ne
+	COND(bic,s,ne)	ip, r1, #0x80000000
+	bne	LSYM(Ldv_d)
+	@ One or both arguments are zero.
+	bics	r2, r0, #0x80000000
+	bne	LSYM(Lml_i)		@ <non_zero> / 0 -> INF
+	bics	r3, r1, #0x80000000
+	bne	LSYM(Lml_z)		@ 0 / <non_zero> -> 0
+	b	LSYM(Lml_n)		@ 0 / 0 -> NAN
+
+	FUNC_END aeabi_fdiv
+	FUNC_END divsf3
+
+#endif /* L_muldivsf3 */
+
+#ifdef L_arm_cmpsf2
+
+	@ The return value in r0 is
+	@
+	@   0  if the operands are equal
+	@   1  if the first operand is greater than the second, or
+	@      the operands are unordered and the operation is
+	@      CMP, LT, LE, NE, or EQ.
+	@   -1 if the first operand is less than the second, or
+	@      the operands are unordered and the operation is GT
+	@      or GE.
+	@
+	@ The Z flag will be set iff the operands are equal.
+	@
+	@ The following registers are clobbered by this function:
+	@   ip, r0, r1, r2, r3
+
+ARM_FUNC_START gtsf2
+ARM_FUNC_ALIAS gesf2 gtsf2
+	mov	ip, #-1
+	b	1f
+
+ARM_FUNC_START ltsf2
+ARM_FUNC_ALIAS lesf2 ltsf2
+	mov	ip, #1
+	b	1f
+
+ARM_FUNC_START cmpsf2
+ARM_FUNC_ALIAS nesf2 cmpsf2
+ARM_FUNC_ALIAS eqsf2 cmpsf2
+	mov	ip, #1			@ how should we specify unordered here?
+
+1:	str	ip, [sp, #-4]!
+
+	@ Trap any INF/NAN first.
+	mov	r2, r0, lsl #1
+	mov	r3, r1, lsl #1
+	mvns	ip, r2, asr #24
+	do_it	ne
+	COND(mvn,s,ne)	ip, r3, asr #24
+	beq	3f
+
+	@ Compare values.
+	@ Note that 0.0 is equal to -0.0.
+2:	add	sp, sp, #4
+	orrs	ip, r2, r3, lsr #1	@ test if both are 0, clear C flag
+	do_it	ne
+	teqne	r0, r1			@ if not 0 compare sign
+	do_it	pl
+	COND(sub,s,pl)	r0, r2, r3		@ if same sign compare values, set r0
+
+	@ Result:
+	do_it	hi
+	movhi	r0, r1, asr #31
+	do_it	lo
+	mvnlo	r0, r1, asr #31
+	do_it	ne
+	orrne	r0, r0, #1
+	RET
+
+	@ Look for a NAN. 
+3:	mvns	ip, r2, asr #24
+	bne	4f
+	movs	ip, r0, lsl #9
+	bne	5f			@ r0 is NAN
+4:	mvns	ip, r3, asr #24
+	bne	2b
+	movs	ip, r1, lsl #9
+	beq	2b			@ r1 is not NAN
+5:	ldr	r0, [sp], #4		@ return unordered code.
+	RET
+
+	FUNC_END gesf2
+	FUNC_END gtsf2
+	FUNC_END lesf2
+	FUNC_END ltsf2
+	FUNC_END nesf2
+	FUNC_END eqsf2
+	FUNC_END cmpsf2
+
+ARM_FUNC_START aeabi_cfrcmple
+
+	mov	ip, r0
+	mov	r0, r1
+	mov	r1, ip
+	b	6f
+
+ARM_FUNC_START aeabi_cfcmpeq
+ARM_FUNC_ALIAS aeabi_cfcmple aeabi_cfcmpeq
+
+	@ The status-returning routines are required to preserve all
+	@ registers except ip, lr, and cpsr.
+6:	do_push	{r0, r1, r2, r3, lr}
+	ARM_CALL cmpsf2
+	@ Set the Z flag correctly, and the C flag unconditionally.
+	cmp	r0, #0
+	@ Clear the C flag if the return value was -1, indicating
+	@ that the first operand was smaller than the second.
+	do_it	mi
+	cmnmi	r0, #0
+	RETLDM	"r0, r1, r2, r3"
+
+	FUNC_END aeabi_cfcmple
+	FUNC_END aeabi_cfcmpeq
+	FUNC_END aeabi_cfrcmple
+
+ARM_FUNC_START	aeabi_fcmpeq
+
+	str	lr, [sp, #-8]!
+	ARM_CALL aeabi_cfcmple
+	do_it	eq, e
+	moveq	r0, #1	@ Equal to.
+	movne	r0, #0	@ Less than, greater than, or unordered.
+	RETLDM
+
+	FUNC_END aeabi_fcmpeq
+
+ARM_FUNC_START	aeabi_fcmplt
+
+	str	lr, [sp, #-8]!
+	ARM_CALL aeabi_cfcmple
+	do_it	cc, e
+	movcc	r0, #1	@ Less than.
+	movcs	r0, #0	@ Equal to, greater than, or unordered.
+	RETLDM
+
+	FUNC_END aeabi_fcmplt
+
+ARM_FUNC_START	aeabi_fcmple
+
+	str	lr, [sp, #-8]!
+	ARM_CALL aeabi_cfcmple
+	do_it	ls, e
+	movls	r0, #1  @ Less than or equal to.
+	movhi	r0, #0	@ Greater than or unordered.
+	RETLDM
+
+	FUNC_END aeabi_fcmple
+
+ARM_FUNC_START	aeabi_fcmpge
+
+	str	lr, [sp, #-8]!
+	ARM_CALL aeabi_cfrcmple
+	do_it	ls, e
+	movls	r0, #1	@ Operand 2 is less than or equal to operand 1.
+	movhi	r0, #0	@ Operand 2 greater than operand 1, or unordered.
+	RETLDM
+
+	FUNC_END aeabi_fcmpge
+
+ARM_FUNC_START	aeabi_fcmpgt
+
+	str	lr, [sp, #-8]!
+	ARM_CALL aeabi_cfrcmple
+	do_it	cc, e
+	movcc	r0, #1	@ Operand 2 is less than operand 1.
+	movcs	r0, #0  @ Operand 2 is greater than or equal to operand 1,
+			@ or they are unordered.
+	RETLDM
+
+	FUNC_END aeabi_fcmpgt
+
+#endif /* L_cmpsf2 */
+
+#ifdef L_arm_unordsf2
+
+ARM_FUNC_START unordsf2
+ARM_FUNC_ALIAS aeabi_fcmpun unordsf2
+
+	mov	r2, r0, lsl #1
+	mov	r3, r1, lsl #1
+	mvns	ip, r2, asr #24
+	bne	1f
+	movs	ip, r0, lsl #9
+	bne	3f			@ r0 is NAN
+1:	mvns	ip, r3, asr #24
+	bne	2f
+	movs	ip, r1, lsl #9
+	bne	3f			@ r1 is NAN
+2:	mov	r0, #0			@ arguments are ordered.
+	RET
+3:	mov	r0, #1			@ arguments are unordered.
+	RET
+
+	FUNC_END aeabi_fcmpun
+	FUNC_END unordsf2
+
+#endif /* L_unordsf2 */
+
+#ifdef L_arm_fixsfsi
+
+ARM_FUNC_START fixsfsi
+ARM_FUNC_ALIAS aeabi_f2iz fixsfsi
+
+	@ check exponent range.
+	mov	r2, r0, lsl #1
+	cmp	r2, #(127 << 24)
+	bcc	1f			@ value is too small
+	mov	r3, #(127 + 31)
+	subs	r2, r3, r2, lsr #24
+	bls	2f			@ value is too large
+
+	@ scale value
+	mov	r3, r0, lsl #8
+	orr	r3, r3, #0x80000000
+	tst	r0, #0x80000000		@ the sign bit
+	shift1	lsr, r0, r3, r2
+	do_it	ne
+	rsbne	r0, r0, #0
+	RET
+
+1:	mov	r0, #0
+	RET
+
+2:	cmp	r2, #(127 + 31 - 0xff)
+	bne	3f
+	movs	r2, r0, lsl #9
+	bne	4f			@ r0 is NAN.
+3:	ands	r0, r0, #0x80000000	@ the sign bit
+	do_it	eq
+	moveq	r0, #0x7fffffff		@ the maximum signed positive si
+	RET
+
+4:	mov	r0, #0			@ What should we convert NAN to?
+	RET
+
+	FUNC_END aeabi_f2iz
+	FUNC_END fixsfsi
+
+#endif /* L_fixsfsi */
+
+#ifdef L_arm_fixunssfsi
+
+ARM_FUNC_START fixunssfsi
+ARM_FUNC_ALIAS aeabi_f2uiz fixunssfsi
+
+	@ check exponent range.
+	movs	r2, r0, lsl #1
+	bcs	1f			@ value is negative
+	cmp	r2, #(127 << 24)
+	bcc	1f			@ value is too small
+	mov	r3, #(127 + 31)
+	subs	r2, r3, r2, lsr #24
+	bmi	2f			@ value is too large
+
+	@ scale the value
+	mov	r3, r0, lsl #8
+	orr	r3, r3, #0x80000000
+	shift1	lsr, r0, r3, r2
+	RET
+
+1:	mov	r0, #0
+	RET
+
+2:	cmp	r2, #(127 + 31 - 0xff)
+	bne	3f
+	movs	r2, r0, lsl #9
+	bne	4f			@ r0 is NAN.
+3:	mov	r0, #0xffffffff		@ maximum unsigned si
+	RET
+
+4:	mov	r0, #0			@ What should we convert NAN to?
+	RET
+
+	FUNC_END aeabi_f2uiz
+	FUNC_END fixunssfsi
+
+#endif /* L_fixunssfsi */
diff --git a/libgcc/config/arm/lib1funcs.S b/libgcc/config/arm/lib1funcs.S
new file mode 100644
index 00000000000..2e76c01df4b
--- /dev/null
+++ b/libgcc/config/arm/lib1funcs.S
@@ -0,0 +1,1829 @@
+@ libgcc routines for ARM cpu.
+@ Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
+
+/* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005, 2007, 2008,
+   2009, 2010 Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/* An executable stack is *not* required for these functions.  */
+#if defined(__ELF__) && defined(__linux__)
+.section .note.GNU-stack,"",%progbits
+.previous
+#endif  /* __ELF__ and __linux__ */
+
+#ifdef __ARM_EABI__
+/* Some attributes that are common to all routines in this file.  */
+	/* Tag_ABI_align_needed: This code does not require 8-byte
+	   alignment from the caller.  */
+	/* .eabi_attribute 24, 0  -- default setting.  */
+	/* Tag_ABI_align_preserved: This code preserves 8-byte
+	   alignment in any callee.  */
+	.eabi_attribute 25, 1
+#endif /* __ARM_EABI__ */
+/* ------------------------------------------------------------------------ */
+
+/* We need to know what prefix to add to function names.  */
+
+#ifndef __USER_LABEL_PREFIX__
+#error  __USER_LABEL_PREFIX__ not defined
+#endif
+
+/* ANSI concatenation macros.  */
+
+#define CONCAT1(a, b) CONCAT2(a, b)
+#define CONCAT2(a, b) a ## b
+
+/* Use the right prefix for global labels.  */
+
+#define SYM(x) CONCAT1 (__USER_LABEL_PREFIX__, x)
+
+#ifdef __ELF__
+#ifdef __thumb__
+#define __PLT__  /* Not supported in Thumb assembler (for now).  */
+#elif defined __vxworks && !defined __PIC__
+#define __PLT__ /* Not supported by the kernel loader.  */
+#else
+#define __PLT__ (PLT)
+#endif
+#define TYPE(x) .type SYM(x),function
+#define SIZE(x) .size SYM(x), . - SYM(x)
+#define LSYM(x) .x
+#else
+#define __PLT__
+#define TYPE(x)
+#define SIZE(x)
+#define LSYM(x) x
+#endif
+
+/* Function end macros.  Variants for interworking.  */
+
+#if defined(__ARM_ARCH_2__)
+# define __ARM_ARCH__ 2
+#endif
+
+#if defined(__ARM_ARCH_3__)
+# define __ARM_ARCH__ 3
+#endif
+
+#if defined(__ARM_ARCH_3M__) || defined(__ARM_ARCH_4__) \
+	|| defined(__ARM_ARCH_4T__)
+/* We use __ARM_ARCH__ set to 4 here, but in reality it's any processor with
+   long multiply instructions.  That includes v3M.  */
+# define __ARM_ARCH__ 4
+#endif
+	
+#if defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) \
+	|| defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \
+	|| defined(__ARM_ARCH_5TEJ__)
+# define __ARM_ARCH__ 5
+#endif
+
+#if defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \
+	|| defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) \
+	|| defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) \
+	|| defined(__ARM_ARCH_6M__)
+# define __ARM_ARCH__ 6
+#endif
+
+#if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \
+	|| defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \
+	|| defined(__ARM_ARCH_7EM__)
+# define __ARM_ARCH__ 7
+#endif
+
+#ifndef __ARM_ARCH__
+#error Unable to determine architecture.
+#endif
+
+/* There are times when we might prefer Thumb1 code even if ARM code is
+   permitted, for example, the code might be smaller, or there might be
+   interworking problems with switching to ARM state if interworking is
+   disabled.  */
+#if (defined(__thumb__)			\
+     && !defined(__thumb2__)		\
+     && (!defined(__THUMB_INTERWORK__)	\
+	 || defined (__OPTIMIZE_SIZE__)	\
+	 || defined(__ARM_ARCH_6M__)))
+# define __prefer_thumb__
+#endif
+
+/* How to return from a function call depends on the architecture variant.  */
+
+#if (__ARM_ARCH__ > 4) || defined(__ARM_ARCH_4T__)
+
+# define RET		bx	lr
+# define RETc(x)	bx##x	lr
+
+/* Special precautions for interworking on armv4t.  */
+# if (__ARM_ARCH__ == 4)
+
+/* Always use bx, not ldr pc.  */
+#  if (defined(__thumb__) || defined(__THUMB_INTERWORK__))
+#    define __INTERWORKING__
+#   endif /* __THUMB__ || __THUMB_INTERWORK__ */
+
+/* Include thumb stub before arm mode code.  */
+#  if defined(__thumb__) && !defined(__THUMB_INTERWORK__)
+#   define __INTERWORKING_STUBS__
+#  endif /* __thumb__ && !__THUMB_INTERWORK__ */
+
+#endif /* __ARM_ARCH == 4 */
+
+#else
+
+# define RET		mov	pc, lr
+# define RETc(x)	mov##x	pc, lr
+
+#endif
+
+.macro	cfi_pop		advance, reg, cfa_offset
+#ifdef __ELF__
+	.pushsection	.debug_frame
+	.byte	0x4		/* DW_CFA_advance_loc4 */
+	.4byte	\advance
+	.byte	(0xc0 | \reg)	/* DW_CFA_restore */
+	.byte	0xe		/* DW_CFA_def_cfa_offset */
+	.uleb128 \cfa_offset
+	.popsection
+#endif
+.endm
+.macro	cfi_push	advance, reg, offset, cfa_offset
+#ifdef __ELF__
+	.pushsection	.debug_frame
+	.byte	0x4		/* DW_CFA_advance_loc4 */
+	.4byte	\advance
+	.byte	(0x80 | \reg)	/* DW_CFA_offset */
+	.uleb128 (\offset / -4)
+	.byte	0xe		/* DW_CFA_def_cfa_offset */
+	.uleb128 \cfa_offset
+	.popsection
+#endif
+.endm
+.macro cfi_start	start_label, end_label
+#ifdef __ELF__
+	.pushsection	.debug_frame
+LSYM(Lstart_frame):
+	.4byte	LSYM(Lend_cie) - LSYM(Lstart_cie) @ Length of CIE
+LSYM(Lstart_cie):
+        .4byte	0xffffffff	@ CIE Identifier Tag
+        .byte	0x1	@ CIE Version
+        .ascii	"\0"	@ CIE Augmentation
+        .uleb128 0x1	@ CIE Code Alignment Factor
+        .sleb128 -4	@ CIE Data Alignment Factor
+        .byte	0xe	@ CIE RA Column
+        .byte	0xc	@ DW_CFA_def_cfa
+        .uleb128 0xd
+        .uleb128 0x0
+
+	.align 2
+LSYM(Lend_cie):
+	.4byte	LSYM(Lend_fde)-LSYM(Lstart_fde)	@ FDE Length
+LSYM(Lstart_fde):
+	.4byte	LSYM(Lstart_frame)	@ FDE CIE offset
+	.4byte	\start_label	@ FDE initial location
+	.4byte	\end_label-\start_label	@ FDE address range
+	.popsection
+#endif
+.endm
+.macro cfi_end	end_label
+#ifdef __ELF__
+	.pushsection	.debug_frame
+	.align	2
+LSYM(Lend_fde):
+	.popsection
+\end_label:
+#endif
+.endm
+
+/* Don't pass dirn, it's there just to get token pasting right.  */
+
+.macro	RETLDM	regs=, cond=, unwind=, dirn=ia
+#if defined (__INTERWORKING__)
+	.ifc "\regs",""
+	ldr\cond	lr, [sp], #8
+	.else
+# if defined(__thumb2__)
+	pop\cond	{\regs, lr}
+# else
+	ldm\cond\dirn	sp!, {\regs, lr}
+# endif
+	.endif
+	.ifnc "\unwind", ""
+	/* Mark LR as restored.  */
+97:	cfi_pop 97b - \unwind, 0xe, 0x0
+	.endif
+	bx\cond	lr
+#else
+	/* Caller is responsible for providing IT instruction.  */
+	.ifc "\regs",""
+	ldr\cond	pc, [sp], #8
+	.else
+# if defined(__thumb2__)
+	pop\cond	{\regs, pc}
+# else
+	ldm\cond\dirn	sp!, {\regs, pc}
+# endif
+	.endif
+#endif
+.endm
+
+/* The Unified assembly syntax allows the same code to be assembled for both
+   ARM and Thumb-2.  However this is only supported by recent gas, so define
+   a set of macros to allow ARM code on older assemblers.  */
+#if defined(__thumb2__)
+.macro do_it cond, suffix=""
+	it\suffix	\cond
+.endm
+.macro shift1 op, arg0, arg1, arg2
+	\op	\arg0, \arg1, \arg2
+.endm
+#define do_push	push
+#define do_pop	pop
+#define COND(op1, op2, cond) op1 ## op2 ## cond
+/* Perform an arithmetic operation with a variable shift operand.  This
+   requires two instructions and a scratch register on Thumb-2.  */
+.macro shiftop name, dest, src1, src2, shiftop, shiftreg, tmp
+	\shiftop \tmp, \src2, \shiftreg
+	\name \dest, \src1, \tmp
+.endm
+#else
+.macro do_it cond, suffix=""
+.endm
+.macro shift1 op, arg0, arg1, arg2
+	mov	\arg0, \arg1, \op \arg2
+.endm
+#define do_push	stmfd sp!,
+#define do_pop	ldmfd sp!,
+#define COND(op1, op2, cond) op1 ## cond ## op2
+.macro shiftop name, dest, src1, src2, shiftop, shiftreg, tmp
+	\name \dest, \src1, \src2, \shiftop \shiftreg
+.endm
+#endif
+
+#ifdef __ARM_EABI__
+.macro ARM_LDIV0 name signed
+	cmp	r0, #0
+	.ifc	\signed, unsigned
+	movne	r0, #0xffffffff
+	.else
+	movgt	r0, #0x7fffffff
+	movlt	r0, #0x80000000
+	.endif
+	b	SYM (__aeabi_idiv0) __PLT__
+.endm
+#else
+.macro ARM_LDIV0 name signed
+	str	lr, [sp, #-8]!
+98:	cfi_push 98b - __\name, 0xe, -0x8, 0x8
+	bl	SYM (__div0) __PLT__
+	mov	r0, #0			@ About as wrong as it could be.
+	RETLDM	unwind=98b
+.endm
+#endif
+
+
+#ifdef __ARM_EABI__
+.macro THUMB_LDIV0 name signed
+#if defined(__ARM_ARCH_6M__)
+	.ifc \signed, unsigned
+	cmp	r0, #0
+	beq	1f
+	mov	r0, #0
+	mvn	r0, r0		@ 0xffffffff
+1:
+	.else
+	cmp	r0, #0
+	beq	2f
+	blt	3f
+	mov	r0, #0
+	mvn	r0, r0
+	lsr	r0, r0, #1	@ 0x7fffffff
+	b	2f
+3:	mov	r0, #0x80
+	lsl	r0, r0, #24	@ 0x80000000
+2:
+	.endif
+	push	{r0, r1, r2}
+	ldr	r0, 4f
+	adr	r1, 4f
+	add	r0, r1
+	str	r0, [sp, #8]
+	@ We know we are not on armv4t, so pop pc is safe.
+	pop	{r0, r1, pc}
+	.align	2
+4:
+	.word	__aeabi_idiv0 - 4b
+#elif defined(__thumb2__)
+	.syntax unified
+	.ifc \signed, unsigned
+	cbz	r0, 1f
+	mov	r0, #0xffffffff
+1:
+	.else
+	cmp	r0, #0
+	do_it	gt
+	movgt	r0, #0x7fffffff
+	do_it	lt
+	movlt	r0, #0x80000000
+	.endif
+	b.w	SYM(__aeabi_idiv0) __PLT__
+#else
+	.align	2
+	bx	pc
+	nop
+	.arm
+	cmp	r0, #0
+	.ifc	\signed, unsigned
+	movne	r0, #0xffffffff
+	.else
+	movgt	r0, #0x7fffffff
+	movlt	r0, #0x80000000
+	.endif
+	b	SYM(__aeabi_idiv0) __PLT__
+	.thumb
+#endif
+.endm
+#else
+.macro THUMB_LDIV0 name signed
+	push	{ r1, lr }
+98:	cfi_push 98b - __\name, 0xe, -0x4, 0x8
+	bl	SYM (__div0)
+	mov	r0, #0			@ About as wrong as it could be.
+#if defined (__INTERWORKING__)
+	pop	{ r1, r2 }
+	bx	r2
+#else
+	pop	{ r1, pc }
+#endif
+.endm
+#endif
+
+.macro FUNC_END name
+	SIZE (__\name)
+.endm
+
+.macro DIV_FUNC_END name signed
+	cfi_start	__\name, LSYM(Lend_div0)
+LSYM(Ldiv0):
+#ifdef __thumb__
+	THUMB_LDIV0 \name \signed
+#else
+	ARM_LDIV0 \name \signed
+#endif
+	cfi_end	LSYM(Lend_div0)
+	FUNC_END \name
+.endm
+
+.macro THUMB_FUNC_START name
+	.globl	SYM (\name)
+	TYPE	(\name)
+	.thumb_func
+SYM (\name):
+.endm
+
+/* Function start macros.  Variants for ARM and Thumb.  */
+
+#ifdef __thumb__
+#define THUMB_FUNC .thumb_func
+#define THUMB_CODE .force_thumb
+# if defined(__thumb2__)
+#define THUMB_SYNTAX .syntax divided
+# else
+#define THUMB_SYNTAX
+# endif
+#else
+#define THUMB_FUNC
+#define THUMB_CODE
+#define THUMB_SYNTAX
+#endif
+
+.macro FUNC_START name
+	.text
+	.globl SYM (__\name)
+	TYPE (__\name)
+	.align 0
+	THUMB_CODE
+	THUMB_FUNC
+	THUMB_SYNTAX
+SYM (__\name):
+.endm
+
+/* Special function that will always be coded in ARM assembly, even if
+   in Thumb-only compilation.  */
+
+#if defined(__thumb2__)
+
+/* For Thumb-2 we build everything in thumb mode.  */
+.macro ARM_FUNC_START name
+       FUNC_START \name
+       .syntax unified
+.endm
+#define EQUIV .thumb_set
+.macro  ARM_CALL name
+	bl	__\name
+.endm
+
+#elif defined(__INTERWORKING_STUBS__)
+
+.macro	ARM_FUNC_START name
+	FUNC_START \name
+	bx	pc
+	nop
+	.arm
+/* A hook to tell gdb that we've switched to ARM mode.  Also used to call
+   directly from other local arm routines.  */
+_L__\name:		
+.endm
+#define EQUIV .thumb_set
+/* Branch directly to a function declared with ARM_FUNC_START.
+   Must be called in arm mode.  */
+.macro  ARM_CALL name
+	bl	_L__\name
+.endm
+
+#else /* !(__INTERWORKING_STUBS__ || __thumb2__) */
+
+#ifdef __ARM_ARCH_6M__
+#define EQUIV .thumb_set
+#else
+.macro	ARM_FUNC_START name
+	.text
+	.globl SYM (__\name)
+	TYPE (__\name)
+	.align 0
+	.arm
+SYM (__\name):
+.endm
+#define EQUIV .set
+.macro  ARM_CALL name
+	bl	__\name
+.endm
+#endif
+
+#endif
+
+.macro	FUNC_ALIAS new old
+	.globl	SYM (__\new)
+#if defined (__thumb__)
+	.thumb_set	SYM (__\new), SYM (__\old)
+#else
+	.set	SYM (__\new), SYM (__\old)
+#endif
+.endm
+
+#ifndef __ARM_ARCH_6M__
+.macro	ARM_FUNC_ALIAS new old
+	.globl	SYM (__\new)
+	EQUIV	SYM (__\new), SYM (__\old)
+#if defined(__INTERWORKING_STUBS__)
+	.set	SYM (_L__\new), SYM (_L__\old)
+#endif
+.endm
+#endif
+
+#ifdef __ARMEB__
+#define xxh r0
+#define xxl r1
+#define yyh r2
+#define yyl r3
+#else
+#define xxh r1
+#define xxl r0
+#define yyh r3
+#define yyl r2
+#endif	
+
+#ifdef __ARM_EABI__
+.macro	WEAK name
+	.weak SYM (__\name)
+.endm
+#endif
+
+#ifdef __thumb__
+/* Register aliases.  */
+
+work		.req	r4	@ XXXX is this safe ?
+dividend	.req	r0
+divisor		.req	r1
+overdone	.req	r2
+result		.req	r2
+curbit		.req	r3
+#endif
+#if 0
+ip		.req	r12
+sp		.req	r13
+lr		.req	r14
+pc		.req	r15
+#endif
+
+/* ------------------------------------------------------------------------ */
+/*		Bodies of the division and modulo routines.		    */
+/* ------------------------------------------------------------------------ */	
+.macro ARM_DIV_BODY dividend, divisor, result, curbit
+
+#if __ARM_ARCH__ >= 5 && ! defined (__OPTIMIZE_SIZE__)
+
+#if defined (__thumb2__)
+	clz	\curbit, \dividend
+	clz	\result, \divisor
+	sub	\curbit, \result, \curbit
+	rsb	\curbit, \curbit, #31
+	adr	\result, 1f
+	add	\curbit, \result, \curbit, lsl #4
+	mov	\result, #0
+	mov	pc, \curbit
+.p2align 3
+1:
+	.set	shift, 32
+	.rept	32
+	.set	shift, shift - 1
+	cmp.w	\dividend, \divisor, lsl #shift
+	nop.n
+	adc.w	\result, \result, \result
+	it	cs
+	subcs.w	\dividend, \dividend, \divisor, lsl #shift
+	.endr
+#else
+	clz	\curbit, \dividend
+	clz	\result, \divisor
+	sub	\curbit, \result, \curbit
+	rsbs	\curbit, \curbit, #31
+	addne	\curbit, \curbit, \curbit, lsl #1
+	mov	\result, #0
+	addne	pc, pc, \curbit, lsl #2
+	nop
+	.set	shift, 32
+	.rept	32
+	.set	shift, shift - 1
+	cmp	\dividend, \divisor, lsl #shift
+	adc	\result, \result, \result
+	subcs	\dividend, \dividend, \divisor, lsl #shift
+	.endr
+#endif
+
+#else /* __ARM_ARCH__ < 5 || defined (__OPTIMIZE_SIZE__) */
+#if __ARM_ARCH__ >= 5
+
+	clz	\curbit, \divisor
+	clz	\result, \dividend
+	sub	\result, \curbit, \result
+	mov	\curbit, #1
+	mov	\divisor, \divisor, lsl \result
+	mov	\curbit, \curbit, lsl \result
+	mov	\result, #0
+	
+#else /* __ARM_ARCH__ < 5 */
+
+	@ Initially shift the divisor left 3 bits if possible,
+	@ set curbit accordingly.  This allows for curbit to be located
+	@ at the left end of each 4-bit nibbles in the division loop
+	@ to save one loop in most cases.
+	tst	\divisor, #0xe0000000
+	moveq	\divisor, \divisor, lsl #3
+	moveq	\curbit, #8
+	movne	\curbit, #1
+
+	@ Unless the divisor is very big, shift it up in multiples of
+	@ four bits, since this is the amount of unwinding in the main
+	@ division loop.  Continue shifting until the divisor is 
+	@ larger than the dividend.
+1:	cmp	\divisor, #0x10000000
+	cmplo	\divisor, \dividend
+	movlo	\divisor, \divisor, lsl #4
+	movlo	\curbit, \curbit, lsl #4
+	blo	1b
+
+	@ For very big divisors, we must shift it a bit at a time, or
+	@ we will be in danger of overflowing.
+1:	cmp	\divisor, #0x80000000
+	cmplo	\divisor, \dividend
+	movlo	\divisor, \divisor, lsl #1
+	movlo	\curbit, \curbit, lsl #1
+	blo	1b
+
+	mov	\result, #0
+
+#endif /* __ARM_ARCH__ < 5 */
+
+	@ Division loop
+1:	cmp	\dividend, \divisor
+	do_it	hs, t
+	subhs	\dividend, \dividend, \divisor
+	orrhs	\result,   \result,   \curbit
+	cmp	\dividend, \divisor,  lsr #1
+	do_it	hs, t
+	subhs	\dividend, \dividend, \divisor, lsr #1
+	orrhs	\result,   \result,   \curbit,  lsr #1
+	cmp	\dividend, \divisor,  lsr #2
+	do_it	hs, t
+	subhs	\dividend, \dividend, \divisor, lsr #2
+	orrhs	\result,   \result,   \curbit,  lsr #2
+	cmp	\dividend, \divisor,  lsr #3
+	do_it	hs, t
+	subhs	\dividend, \dividend, \divisor, lsr #3
+	orrhs	\result,   \result,   \curbit,  lsr #3
+	cmp	\dividend, #0			@ Early termination?
+	do_it	ne, t
+	movnes	\curbit,   \curbit,  lsr #4	@ No, any more bits to do?
+	movne	\divisor,  \divisor, lsr #4
+	bne	1b
+
+#endif /* __ARM_ARCH__ < 5 || defined (__OPTIMIZE_SIZE__) */
+
+.endm
+/* ------------------------------------------------------------------------ */	
+.macro ARM_DIV2_ORDER divisor, order
+
+#if __ARM_ARCH__ >= 5
+
+	clz	\order, \divisor
+	rsb	\order, \order, #31
+
+#else
+
+	cmp	\divisor, #(1 << 16)
+	movhs	\divisor, \divisor, lsr #16
+	movhs	\order, #16
+	movlo	\order, #0
+
+	cmp	\divisor, #(1 << 8)
+	movhs	\divisor, \divisor, lsr #8
+	addhs	\order, \order, #8
+
+	cmp	\divisor, #(1 << 4)
+	movhs	\divisor, \divisor, lsr #4
+	addhs	\order, \order, #4
+
+	cmp	\divisor, #(1 << 2)
+	addhi	\order, \order, #3
+	addls	\order, \order, \divisor, lsr #1
+
+#endif
+
+.endm
+/* ------------------------------------------------------------------------ */
+.macro ARM_MOD_BODY dividend, divisor, order, spare
+
+#if __ARM_ARCH__ >= 5 && ! defined (__OPTIMIZE_SIZE__)
+
+	clz	\order, \divisor
+	clz	\spare, \dividend
+	sub	\order, \order, \spare
+	rsbs	\order, \order, #31
+	addne	pc, pc, \order, lsl #3
+	nop
+	.set	shift, 32
+	.rept	32
+	.set	shift, shift - 1
+	cmp	\dividend, \divisor, lsl #shift
+	subcs	\dividend, \dividend, \divisor, lsl #shift
+	.endr
+
+#else /* __ARM_ARCH__ < 5 || defined (__OPTIMIZE_SIZE__) */
+#if __ARM_ARCH__ >= 5
+
+	clz	\order, \divisor
+	clz	\spare, \dividend
+	sub	\order, \order, \spare
+	mov	\divisor, \divisor, lsl \order
+	
+#else /* __ARM_ARCH__ < 5 */
+
+	mov	\order, #0
+
+	@ Unless the divisor is very big, shift it up in multiples of
+	@ four bits, since this is the amount of unwinding in the main
+	@ division loop.  Continue shifting until the divisor is 
+	@ larger than the dividend.
+1:	cmp	\divisor, #0x10000000
+	cmplo	\divisor, \dividend
+	movlo	\divisor, \divisor, lsl #4
+	addlo	\order, \order, #4
+	blo	1b
+
+	@ For very big divisors, we must shift it a bit at a time, or
+	@ we will be in danger of overflowing.
+1:	cmp	\divisor, #0x80000000
+	cmplo	\divisor, \dividend
+	movlo	\divisor, \divisor, lsl #1
+	addlo	\order, \order, #1
+	blo	1b
+
+#endif /* __ARM_ARCH__ < 5 */
+
+	@ Perform all needed substractions to keep only the reminder.
+	@ Do comparisons in batch of 4 first.
+	subs	\order, \order, #3		@ yes, 3 is intended here
+	blt	2f
+
+1:	cmp	\dividend, \divisor
+	subhs	\dividend, \dividend, \divisor
+	cmp	\dividend, \divisor,  lsr #1
+	subhs	\dividend, \dividend, \divisor, lsr #1
+	cmp	\dividend, \divisor,  lsr #2
+	subhs	\dividend, \dividend, \divisor, lsr #2
+	cmp	\dividend, \divisor,  lsr #3
+	subhs	\dividend, \dividend, \divisor, lsr #3
+	cmp	\dividend, #1
+	mov	\divisor, \divisor, lsr #4
+	subges	\order, \order, #4
+	bge	1b
+
+	tst	\order, #3
+	teqne	\dividend, #0
+	beq	5f
+
+	@ Either 1, 2 or 3 comparison/substractions are left.
+2:	cmn	\order, #2
+	blt	4f
+	beq	3f
+	cmp	\dividend, \divisor
+	subhs	\dividend, \dividend, \divisor
+	mov	\divisor,  \divisor,  lsr #1
+3:	cmp	\dividend, \divisor
+	subhs	\dividend, \dividend, \divisor
+	mov	\divisor,  \divisor,  lsr #1
+4:	cmp	\dividend, \divisor
+	subhs	\dividend, \dividend, \divisor
+5:
+
+#endif /* __ARM_ARCH__ < 5 || defined (__OPTIMIZE_SIZE__) */
+
+.endm
+/* ------------------------------------------------------------------------ */
+.macro THUMB_DIV_MOD_BODY modulo
+	@ Load the constant 0x10000000 into our work register.
+	mov	work, #1
+	lsl	work, #28
+LSYM(Loop1):
+	@ Unless the divisor is very big, shift it up in multiples of
+	@ four bits, since this is the amount of unwinding in the main
+	@ division loop.  Continue shifting until the divisor is 
+	@ larger than the dividend.
+	cmp	divisor, work
+	bhs	LSYM(Lbignum)
+	cmp	divisor, dividend
+	bhs	LSYM(Lbignum)
+	lsl	divisor, #4
+	lsl	curbit,  #4
+	b	LSYM(Loop1)
+LSYM(Lbignum):
+	@ Set work to 0x80000000
+	lsl	work, #3
+LSYM(Loop2):
+	@ For very big divisors, we must shift it a bit at a time, or
+	@ we will be in danger of overflowing.
+	cmp	divisor, work
+	bhs	LSYM(Loop3)
+	cmp	divisor, dividend
+	bhs	LSYM(Loop3)
+	lsl	divisor, #1
+	lsl	curbit,  #1
+	b	LSYM(Loop2)
+LSYM(Loop3):
+	@ Test for possible subtractions ...
+  .if \modulo
+	@ ... On the final pass, this may subtract too much from the dividend, 
+	@ so keep track of which subtractions are done, we can fix them up 
+	@ afterwards.
+	mov	overdone, #0
+	cmp	dividend, divisor
+	blo	LSYM(Lover1)
+	sub	dividend, dividend, divisor
+LSYM(Lover1):
+	lsr	work, divisor, #1
+	cmp	dividend, work
+	blo	LSYM(Lover2)
+	sub	dividend, dividend, work
+	mov	ip, curbit
+	mov	work, #1
+	ror	curbit, work
+	orr	overdone, curbit
+	mov	curbit, ip
+LSYM(Lover2):
+	lsr	work, divisor, #2
+	cmp	dividend, work
+	blo	LSYM(Lover3)
+	sub	dividend, dividend, work
+	mov	ip, curbit
+	mov	work, #2
+	ror	curbit, work
+	orr	overdone, curbit
+	mov	curbit, ip
+LSYM(Lover3):
+	lsr	work, divisor, #3
+	cmp	dividend, work
+	blo	LSYM(Lover4)
+	sub	dividend, dividend, work
+	mov	ip, curbit
+	mov	work, #3
+	ror	curbit, work
+	orr	overdone, curbit
+	mov	curbit, ip
+LSYM(Lover4):
+	mov	ip, curbit
+  .else
+	@ ... and note which bits are done in the result.  On the final pass,
+	@ this may subtract too much from the dividend, but the result will be ok,
+	@ since the "bit" will have been shifted out at the bottom.
+	cmp	dividend, divisor
+	blo	LSYM(Lover1)
+	sub	dividend, dividend, divisor
+	orr	result, result, curbit
+LSYM(Lover1):
+	lsr	work, divisor, #1
+	cmp	dividend, work
+	blo	LSYM(Lover2)
+	sub	dividend, dividend, work
+	lsr	work, curbit, #1
+	orr	result, work
+LSYM(Lover2):
+	lsr	work, divisor, #2
+	cmp	dividend, work
+	blo	LSYM(Lover3)
+	sub	dividend, dividend, work
+	lsr	work, curbit, #2
+	orr	result, work
+LSYM(Lover3):
+	lsr	work, divisor, #3
+	cmp	dividend, work
+	blo	LSYM(Lover4)
+	sub	dividend, dividend, work
+	lsr	work, curbit, #3
+	orr	result, work
+LSYM(Lover4):
+  .endif
+	
+	cmp	dividend, #0			@ Early termination?
+	beq	LSYM(Lover5)
+	lsr	curbit,  #4			@ No, any more bits to do?
+	beq	LSYM(Lover5)
+	lsr	divisor, #4
+	b	LSYM(Loop3)
+LSYM(Lover5):
+  .if \modulo
+	@ Any subtractions that we should not have done will be recorded in
+	@ the top three bits of "overdone".  Exactly which were not needed
+	@ are governed by the position of the bit, stored in ip.
+	mov	work, #0xe
+	lsl	work, #28
+	and	overdone, work
+	beq	LSYM(Lgot_result)
+	
+	@ If we terminated early, because dividend became zero, then the 
+	@ bit in ip will not be in the bottom nibble, and we should not
+	@ perform the additions below.  We must test for this though
+	@ (rather relying upon the TSTs to prevent the additions) since
+	@ the bit in ip could be in the top two bits which might then match
+	@ with one of the smaller RORs.
+	mov	curbit, ip
+	mov	work, #0x7
+	tst	curbit, work
+	beq	LSYM(Lgot_result)
+	
+	mov	curbit, ip
+	mov	work, #3
+	ror	curbit, work
+	tst	overdone, curbit
+	beq	LSYM(Lover6)
+	lsr	work, divisor, #3
+	add	dividend, work
+LSYM(Lover6):
+	mov	curbit, ip
+	mov	work, #2
+	ror	curbit, work
+	tst	overdone, curbit
+	beq	LSYM(Lover7)
+	lsr	work, divisor, #2
+	add	dividend, work
+LSYM(Lover7):
+	mov	curbit, ip
+	mov	work, #1
+	ror	curbit, work
+	tst	overdone, curbit
+	beq	LSYM(Lgot_result)
+	lsr	work, divisor, #1
+	add	dividend, work
+  .endif
+LSYM(Lgot_result):
+.endm	
+/* ------------------------------------------------------------------------ */
+/*		Start of the Real Functions				    */
+/* ------------------------------------------------------------------------ */
+#ifdef L_udivsi3
+
+#if defined(__prefer_thumb__)
+
+	FUNC_START udivsi3
+	FUNC_ALIAS aeabi_uidiv udivsi3
+
+	cmp	divisor, #0
+	beq	LSYM(Ldiv0)
+LSYM(udivsi3_skip_div0_test):
+	mov	curbit, #1
+	mov	result, #0
+	
+	push	{ work }
+	cmp	dividend, divisor
+	blo	LSYM(Lgot_result)
+
+	THUMB_DIV_MOD_BODY 0
+	
+	mov	r0, result
+	pop	{ work }
+	RET
+
+#else /* ARM version/Thumb-2.  */
+
+	ARM_FUNC_START udivsi3
+	ARM_FUNC_ALIAS aeabi_uidiv udivsi3
+
+	/* Note: if called via udivsi3_skip_div0_test, this will unnecessarily
+	   check for division-by-zero a second time.  */
+LSYM(udivsi3_skip_div0_test):
+	subs	r2, r1, #1
+	do_it	eq
+	RETc(eq)
+	bcc	LSYM(Ldiv0)
+	cmp	r0, r1
+	bls	11f
+	tst	r1, r2
+	beq	12f
+	
+	ARM_DIV_BODY r0, r1, r2, r3
+	
+	mov	r0, r2
+	RET	
+
+11:	do_it	eq, e
+	moveq	r0, #1
+	movne	r0, #0
+	RET
+
+12:	ARM_DIV2_ORDER r1, r2
+
+	mov	r0, r0, lsr r2
+	RET
+
+#endif /* ARM version */
+
+	DIV_FUNC_END udivsi3 unsigned
+
+#if defined(__prefer_thumb__)
+FUNC_START aeabi_uidivmod
+	cmp	r1, #0
+	beq	LSYM(Ldiv0)
+	push	{r0, r1, lr}
+	bl	LSYM(udivsi3_skip_div0_test)
+	POP	{r1, r2, r3}
+	mul	r2, r0
+	sub	r1, r1, r2
+	bx	r3
+#else
+ARM_FUNC_START aeabi_uidivmod
+	cmp	r1, #0
+	beq	LSYM(Ldiv0)
+	stmfd	sp!, { r0, r1, lr }
+	bl	LSYM(udivsi3_skip_div0_test)
+	ldmfd	sp!, { r1, r2, lr }
+	mul	r3, r2, r0
+	sub	r1, r1, r3
+	RET
+#endif
+	FUNC_END aeabi_uidivmod
+	
+#endif /* L_udivsi3 */
+/* ------------------------------------------------------------------------ */
+#ifdef L_umodsi3
+
+	FUNC_START umodsi3
+
+#ifdef __thumb__
+
+	cmp	divisor, #0
+	beq	LSYM(Ldiv0)
+	mov	curbit, #1
+	cmp	dividend, divisor
+	bhs	LSYM(Lover10)
+	RET	
+
+LSYM(Lover10):
+	push	{ work }
+
+	THUMB_DIV_MOD_BODY 1
+	
+	pop	{ work }
+	RET
+	
+#else  /* ARM version.  */
+	
+	subs	r2, r1, #1			@ compare divisor with 1
+	bcc	LSYM(Ldiv0)
+	cmpne	r0, r1				@ compare dividend with divisor
+	moveq   r0, #0
+	tsthi	r1, r2				@ see if divisor is power of 2
+	andeq	r0, r0, r2
+	RETc(ls)
+
+	ARM_MOD_BODY r0, r1, r2, r3
+	
+	RET	
+
+#endif /* ARM version.  */
+	
+	DIV_FUNC_END umodsi3 unsigned
+
+#endif /* L_umodsi3 */
+/* ------------------------------------------------------------------------ */
+#ifdef L_divsi3
+
+#if defined(__prefer_thumb__)
+
+	FUNC_START divsi3	
+	FUNC_ALIAS aeabi_idiv divsi3
+
+	cmp	divisor, #0
+	beq	LSYM(Ldiv0)
+LSYM(divsi3_skip_div0_test):
+	push	{ work }
+	mov	work, dividend
+	eor	work, divisor		@ Save the sign of the result.
+	mov	ip, work
+	mov	curbit, #1
+	mov	result, #0
+	cmp	divisor, #0
+	bpl	LSYM(Lover10)
+	neg	divisor, divisor	@ Loops below use unsigned.
+LSYM(Lover10):
+	cmp	dividend, #0
+	bpl	LSYM(Lover11)
+	neg	dividend, dividend
+LSYM(Lover11):
+	cmp	dividend, divisor
+	blo	LSYM(Lgot_result)
+
+	THUMB_DIV_MOD_BODY 0
+	
+	mov	r0, result
+	mov	work, ip
+	cmp	work, #0
+	bpl	LSYM(Lover12)
+	neg	r0, r0
+LSYM(Lover12):
+	pop	{ work }
+	RET
+
+#else /* ARM/Thumb-2 version.  */
+	
+	ARM_FUNC_START divsi3	
+	ARM_FUNC_ALIAS aeabi_idiv divsi3
+
+	cmp	r1, #0
+	beq	LSYM(Ldiv0)
+LSYM(divsi3_skip_div0_test):
+	eor	ip, r0, r1			@ save the sign of the result.
+	do_it	mi
+	rsbmi	r1, r1, #0			@ loops below use unsigned.
+	subs	r2, r1, #1			@ division by 1 or -1 ?
+	beq	10f
+	movs	r3, r0
+	do_it	mi
+	rsbmi	r3, r0, #0			@ positive dividend value
+	cmp	r3, r1
+	bls	11f
+	tst	r1, r2				@ divisor is power of 2 ?
+	beq	12f
+
+	ARM_DIV_BODY r3, r1, r0, r2
+	
+	cmp	ip, #0
+	do_it	mi
+	rsbmi	r0, r0, #0
+	RET	
+
+10:	teq	ip, r0				@ same sign ?
+	do_it	mi
+	rsbmi	r0, r0, #0
+	RET	
+
+11:	do_it	lo
+	movlo	r0, #0
+	do_it	eq,t
+	moveq	r0, ip, asr #31
+	orreq	r0, r0, #1
+	RET
+
+12:	ARM_DIV2_ORDER r1, r2
+
+	cmp	ip, #0
+	mov	r0, r3, lsr r2
+	do_it	mi
+	rsbmi	r0, r0, #0
+	RET
+
+#endif /* ARM version */
+	
+	DIV_FUNC_END divsi3 signed
+
+#if defined(__prefer_thumb__)
+FUNC_START aeabi_idivmod
+	cmp	r1, #0
+	beq	LSYM(Ldiv0)
+	push	{r0, r1, lr}
+	bl	LSYM(divsi3_skip_div0_test)
+	POP	{r1, r2, r3}
+	mul	r2, r0
+	sub	r1, r1, r2
+	bx	r3
+#else
+ARM_FUNC_START aeabi_idivmod
+	cmp	r1, #0
+	beq	LSYM(Ldiv0)
+	stmfd	sp!, { r0, r1, lr }
+	bl	LSYM(divsi3_skip_div0_test)
+	ldmfd	sp!, { r1, r2, lr }
+	mul	r3, r2, r0
+	sub	r1, r1, r3
+	RET
+#endif
+	FUNC_END aeabi_idivmod
+	
+#endif /* L_divsi3 */
+/* ------------------------------------------------------------------------ */
+#ifdef L_modsi3
+
+	FUNC_START modsi3
+
+#ifdef __thumb__
+
+	mov	curbit, #1
+	cmp	divisor, #0
+	beq	LSYM(Ldiv0)
+	bpl	LSYM(Lover10)
+	neg	divisor, divisor		@ Loops below use unsigned.
+LSYM(Lover10):
+	push	{ work }
+	@ Need to save the sign of the dividend, unfortunately, we need
+	@ work later on.  Must do this after saving the original value of
+	@ the work register, because we will pop this value off first.
+	push	{ dividend }
+	cmp	dividend, #0
+	bpl	LSYM(Lover11)
+	neg	dividend, dividend
+LSYM(Lover11):
+	cmp	dividend, divisor
+	blo	LSYM(Lgot_result)
+
+	THUMB_DIV_MOD_BODY 1
+		
+	pop	{ work }
+	cmp	work, #0
+	bpl	LSYM(Lover12)
+	neg	dividend, dividend
+LSYM(Lover12):
+	pop	{ work }
+	RET	
+
+#else /* ARM version.  */
+	
+	cmp	r1, #0
+	beq	LSYM(Ldiv0)
+	rsbmi	r1, r1, #0			@ loops below use unsigned.
+	movs	ip, r0				@ preserve sign of dividend
+	rsbmi	r0, r0, #0			@ if negative make positive
+	subs	r2, r1, #1			@ compare divisor with 1
+	cmpne	r0, r1				@ compare dividend with divisor
+	moveq	r0, #0
+	tsthi	r1, r2				@ see if divisor is power of 2
+	andeq	r0, r0, r2
+	bls	10f
+
+	ARM_MOD_BODY r0, r1, r2, r3
+
+10:	cmp	ip, #0
+	rsbmi	r0, r0, #0
+	RET	
+
+#endif /* ARM version */
+	
+	DIV_FUNC_END modsi3 signed
+
+#endif /* L_modsi3 */
+/* ------------------------------------------------------------------------ */
+#ifdef L_dvmd_tls
+
+#ifdef __ARM_EABI__
+	WEAK aeabi_idiv0
+	WEAK aeabi_ldiv0
+	FUNC_START aeabi_idiv0
+	FUNC_START aeabi_ldiv0
+	RET
+	FUNC_END aeabi_ldiv0
+	FUNC_END aeabi_idiv0
+#else
+	FUNC_START div0
+	RET
+	FUNC_END div0
+#endif
+	
+#endif /* L_divmodsi_tools */
+/* ------------------------------------------------------------------------ */
+#ifdef L_dvmd_lnx
+@ GNU/Linux division-by zero handler.  Used in place of L_dvmd_tls
+
+/* Constant taken from <asm/signal.h>.  */
+#define SIGFPE	8
+
+#ifdef __ARM_EABI__
+	WEAK aeabi_idiv0
+	WEAK aeabi_ldiv0
+	ARM_FUNC_START aeabi_idiv0
+	ARM_FUNC_START aeabi_ldiv0
+#else
+	ARM_FUNC_START div0
+#endif
+
+	do_push	{r1, lr}
+	mov	r0, #SIGFPE
+	bl	SYM(raise) __PLT__
+	RETLDM	r1
+
+#ifdef __ARM_EABI__
+	FUNC_END aeabi_ldiv0
+	FUNC_END aeabi_idiv0
+#else
+	FUNC_END div0
+#endif
+	
+#endif /* L_dvmd_lnx */
+#ifdef L_clear_cache
+#if defined __ARM_EABI__ && defined __linux__
+@ EABI GNU/Linux call to cacheflush syscall.
+	ARM_FUNC_START clear_cache
+	do_push	{r7}
+#if __ARM_ARCH__ >= 7 || defined(__ARM_ARCH_6T2__)
+	movw	r7, #2
+	movt	r7, #0xf
+#else
+	mov	r7, #0xf0000
+	add	r7, r7, #2
+#endif
+	mov	r2, #0
+	swi	0
+	do_pop	{r7}
+	RET
+	FUNC_END clear_cache
+#else
+#error "This is only for ARM EABI GNU/Linux"
+#endif
+#endif /* L_clear_cache */
+/* ------------------------------------------------------------------------ */
+/* Dword shift operations.  */
+/* All the following Dword shift variants rely on the fact that
+	shft xxx, Reg
+   is in fact done as
+	shft xxx, (Reg & 255)
+   so for Reg value in (32...63) and (-1...-31) we will get zero (in the
+   case of logical shifts) or the sign (for asr).  */
+
+#ifdef __ARMEB__
+#define al	r1
+#define ah	r0
+#else
+#define al	r0
+#define ah	r1
+#endif
+
+/* Prevent __aeabi double-word shifts from being produced on SymbianOS.  */
+#ifndef __symbian__
+
+#ifdef L_lshrdi3
+
+	FUNC_START lshrdi3
+	FUNC_ALIAS aeabi_llsr lshrdi3
+	
+#ifdef __thumb__
+	lsr	al, r2
+	mov	r3, ah
+	lsr	ah, r2
+	mov	ip, r3
+	sub	r2, #32
+	lsr	r3, r2
+	orr	al, r3
+	neg	r2, r2
+	mov	r3, ip
+	lsl	r3, r2
+	orr	al, r3
+	RET
+#else
+	subs	r3, r2, #32
+	rsb	ip, r2, #32
+	movmi	al, al, lsr r2
+	movpl	al, ah, lsr r3
+	orrmi	al, al, ah, lsl ip
+	mov	ah, ah, lsr r2
+	RET
+#endif
+	FUNC_END aeabi_llsr
+	FUNC_END lshrdi3
+
+#endif
+	
+#ifdef L_ashrdi3
+	
+	FUNC_START ashrdi3
+	FUNC_ALIAS aeabi_lasr ashrdi3
+	
+#ifdef __thumb__
+	lsr	al, r2
+	mov	r3, ah
+	asr	ah, r2
+	sub	r2, #32
+	@ If r2 is negative at this point the following step would OR
+	@ the sign bit into all of AL.  That's not what we want...
+	bmi	1f
+	mov	ip, r3
+	asr	r3, r2
+	orr	al, r3
+	mov	r3, ip
+1:
+	neg	r2, r2
+	lsl	r3, r2
+	orr	al, r3
+	RET
+#else
+	subs	r3, r2, #32
+	rsb	ip, r2, #32
+	movmi	al, al, lsr r2
+	movpl	al, ah, asr r3
+	orrmi	al, al, ah, lsl ip
+	mov	ah, ah, asr r2
+	RET
+#endif
+
+	FUNC_END aeabi_lasr
+	FUNC_END ashrdi3
+
+#endif
+
+#ifdef L_ashldi3
+
+	FUNC_START ashldi3
+	FUNC_ALIAS aeabi_llsl ashldi3
+	
+#ifdef __thumb__
+	lsl	ah, r2
+	mov	r3, al
+	lsl	al, r2
+	mov	ip, r3
+	sub	r2, #32
+	lsl	r3, r2
+	orr	ah, r3
+	neg	r2, r2
+	mov	r3, ip
+	lsr	r3, r2
+	orr	ah, r3
+	RET
+#else
+	subs	r3, r2, #32
+	rsb	ip, r2, #32
+	movmi	ah, ah, lsl r2
+	movpl	ah, al, lsl r3
+	orrmi	ah, ah, al, lsr ip
+	mov	al, al, lsl r2
+	RET
+#endif
+	FUNC_END aeabi_llsl
+	FUNC_END ashldi3
+
+#endif
+
+#endif /* __symbian__ */
+
+#if ((__ARM_ARCH__ > 5) && !defined(__ARM_ARCH_6M__)) \
+    || defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \
+    || defined(__ARM_ARCH_5TEJ__)
+#define HAVE_ARM_CLZ 1
+#endif
+
+#ifdef L_clzsi2
+#if defined(__ARM_ARCH_6M__)
+FUNC_START clzsi2
+	mov	r1, #28
+	mov	r3, #1
+	lsl	r3, r3, #16
+	cmp	r0, r3 /* 0x10000 */
+	bcc	2f
+	lsr	r0, r0, #16
+	sub	r1, r1, #16
+2:	lsr	r3, r3, #8
+	cmp	r0, r3 /* #0x100 */
+	bcc	2f
+	lsr	r0, r0, #8
+	sub	r1, r1, #8
+2:	lsr	r3, r3, #4
+	cmp	r0, r3 /* #0x10 */
+	bcc	2f
+	lsr	r0, r0, #4
+	sub	r1, r1, #4
+2:	adr	r2, 1f
+	ldrb	r0, [r2, r0]
+	add	r0, r0, r1
+	bx lr
+.align 2
+1:
+.byte 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
+	FUNC_END clzsi2
+#else
+ARM_FUNC_START clzsi2
+# if defined(HAVE_ARM_CLZ)
+	clz	r0, r0
+	RET
+# else
+	mov	r1, #28
+	cmp	r0, #0x10000
+	do_it	cs, t
+	movcs	r0, r0, lsr #16
+	subcs	r1, r1, #16
+	cmp	r0, #0x100
+	do_it	cs, t
+	movcs	r0, r0, lsr #8
+	subcs	r1, r1, #8
+	cmp	r0, #0x10
+	do_it	cs, t
+	movcs	r0, r0, lsr #4
+	subcs	r1, r1, #4
+	adr	r2, 1f
+	ldrb	r0, [r2, r0]
+	add	r0, r0, r1
+	RET
+.align 2
+1:
+.byte 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
+# endif /* !HAVE_ARM_CLZ */
+	FUNC_END clzsi2
+#endif
+#endif /* L_clzsi2 */
+
+#ifdef L_clzdi2
+#if !defined(HAVE_ARM_CLZ)
+
+# if defined(__ARM_ARCH_6M__)
+FUNC_START clzdi2
+	push	{r4, lr}
+# else
+ARM_FUNC_START clzdi2
+	do_push	{r4, lr}
+# endif
+	cmp	xxh, #0
+	bne	1f
+# ifdef __ARMEB__
+	mov	r0, xxl
+	bl	__clzsi2
+	add	r0, r0, #32
+	b 2f
+1:
+	bl	__clzsi2
+# else
+	bl	__clzsi2
+	add	r0, r0, #32
+	b 2f
+1:
+	mov	r0, xxh
+	bl	__clzsi2
+# endif
+2:
+# if defined(__ARM_ARCH_6M__)
+	pop	{r4, pc}
+# else
+	RETLDM	r4
+# endif
+	FUNC_END clzdi2
+
+#else /* HAVE_ARM_CLZ */
+
+ARM_FUNC_START clzdi2
+	cmp	xxh, #0
+	do_it	eq, et
+	clzeq	r0, xxl
+	clzne	r0, xxh
+	addeq	r0, r0, #32
+	RET
+	FUNC_END clzdi2
+
+#endif
+#endif /* L_clzdi2 */
+
+/* ------------------------------------------------------------------------ */
+/* These next two sections are here despite the fact that they contain Thumb 
+   assembler because their presence allows interworked code to be linked even
+   when the GCC library is this one.  */
+		
+/* Do not build the interworking functions when the target architecture does 
+   not support Thumb instructions.  (This can be a multilib option).  */
+#if defined __ARM_ARCH_4T__ || defined __ARM_ARCH_5T__\
+      || defined __ARM_ARCH_5TE__ || defined __ARM_ARCH_5TEJ__ \
+      || __ARM_ARCH__ >= 6
+
+#if defined L_call_via_rX
+
+/* These labels & instructions are used by the Arm/Thumb interworking code. 
+   The address of function to be called is loaded into a register and then 
+   one of these labels is called via a BL instruction.  This puts the 
+   return address into the link register with the bottom bit set, and the 
+   code here switches to the correct mode before executing the function.  */
+	
+	.text
+	.align 0
+        .force_thumb
+
+.macro call_via register
+	THUMB_FUNC_START _call_via_\register
+
+	bx	\register
+	nop
+
+	SIZE	(_call_via_\register)
+.endm
+
+	call_via r0
+	call_via r1
+	call_via r2
+	call_via r3
+	call_via r4
+	call_via r5
+	call_via r6
+	call_via r7
+	call_via r8
+	call_via r9
+	call_via sl
+	call_via fp
+	call_via ip
+	call_via sp
+	call_via lr
+
+#endif /* L_call_via_rX */
+
+/* Don't bother with the old interworking routines for Thumb-2.  */
+/* ??? Maybe only omit these on "m" variants.  */
+#if !defined(__thumb2__) && !defined(__ARM_ARCH_6M__)
+
+#if defined L_interwork_call_via_rX
+
+/* These labels & instructions are used by the Arm/Thumb interworking code,
+   when the target address is in an unknown instruction set.  The address 
+   of function to be called is loaded into a register and then one of these
+   labels is called via a BL instruction.  This puts the return address 
+   into the link register with the bottom bit set, and the code here 
+   switches to the correct mode before executing the function.  Unfortunately
+   the target code cannot be relied upon to return via a BX instruction, so
+   instead we have to store the resturn address on the stack and allow the
+   called function to return here instead.  Upon return we recover the real
+   return address and use a BX to get back to Thumb mode.
+
+   There are three variations of this code.  The first,
+   _interwork_call_via_rN(), will push the return address onto the
+   stack and pop it in _arm_return().  It should only be used if all
+   arguments are passed in registers.
+
+   The second, _interwork_r7_call_via_rN(), instead stores the return
+   address at [r7, #-4].  It is the caller's responsibility to ensure
+   that this address is valid and contains no useful data.
+
+   The third, _interwork_r11_call_via_rN(), works in the same way but
+   uses r11 instead of r7.  It is useful if the caller does not really
+   need a frame pointer.  */
+	
+	.text
+	.align 0
+
+	.code   32
+	.globl _arm_return
+LSYM(Lstart_arm_return):
+	cfi_start	LSYM(Lstart_arm_return) LSYM(Lend_arm_return)
+	cfi_push	0, 0xe, -0x8, 0x8
+	nop	@ This nop is for the benefit of debuggers, so that
+		@ backtraces will use the correct unwind information.
+_arm_return:
+	RETLDM	unwind=LSYM(Lstart_arm_return)
+	cfi_end	LSYM(Lend_arm_return)
+
+	.globl _arm_return_r7
+_arm_return_r7:
+	ldr	lr, [r7, #-4]
+	bx	lr
+
+	.globl _arm_return_r11
+_arm_return_r11:
+	ldr	lr, [r11, #-4]
+	bx	lr
+
+.macro interwork_with_frame frame, register, name, return
+	.code	16
+
+	THUMB_FUNC_START \name
+
+	bx	pc
+	nop
+
+	.code	32
+	tst	\register, #1
+	streq	lr, [\frame, #-4]
+	adreq	lr, _arm_return_\frame
+	bx	\register
+
+	SIZE	(\name)
+.endm
+
+.macro interwork register
+	.code	16
+
+	THUMB_FUNC_START _interwork_call_via_\register
+
+	bx	pc
+	nop
+
+	.code	32
+	.globl LSYM(Lchange_\register)
+LSYM(Lchange_\register):
+	tst	\register, #1
+	streq	lr, [sp, #-8]!
+	adreq	lr, _arm_return
+	bx	\register
+
+	SIZE	(_interwork_call_via_\register)
+
+	interwork_with_frame r7,\register,_interwork_r7_call_via_\register
+	interwork_with_frame r11,\register,_interwork_r11_call_via_\register
+.endm
+	
+	interwork r0
+	interwork r1
+	interwork r2
+	interwork r3
+	interwork r4
+	interwork r5
+	interwork r6
+	interwork r7
+	interwork r8
+	interwork r9
+	interwork sl
+	interwork fp
+	interwork ip
+	interwork sp
+	
+	/* The LR case has to be handled a little differently...  */
+	.code 16
+
+	THUMB_FUNC_START _interwork_call_via_lr
+
+	bx 	pc
+	nop
+	
+	.code 32
+	.globl .Lchange_lr
+.Lchange_lr:
+	tst	lr, #1
+	stmeqdb	r13!, {lr, pc}
+	mov	ip, lr
+	adreq	lr, _arm_return
+	bx	ip
+	
+	SIZE	(_interwork_call_via_lr)
+	
+#endif /* L_interwork_call_via_rX */
+#endif /* !__thumb2__ */
+
+/* Functions to support compact pic switch tables in thumb1 state.
+   All these routines take an index into the table in r0.  The
+   table is at LR & ~1 (but this must be rounded up in the case
+   of 32-bit entires).  They are only permitted to clobber r12
+   and r14 and r0 must be preserved on exit.  */
+#ifdef L_thumb1_case_sqi
+	
+	.text
+	.align 0
+        .force_thumb
+	.syntax unified
+	THUMB_FUNC_START __gnu_thumb1_case_sqi
+	push	{r1}
+	mov	r1, lr
+	lsrs	r1, r1, #1
+	lsls	r1, r1, #1
+	ldrsb	r1, [r1, r0]
+	lsls	r1, r1, #1
+	add	lr, lr, r1
+	pop	{r1}
+	bx	lr
+	SIZE (__gnu_thumb1_case_sqi)
+#endif
+
+#ifdef L_thumb1_case_uqi
+	
+	.text
+	.align 0
+        .force_thumb
+	.syntax unified
+	THUMB_FUNC_START __gnu_thumb1_case_uqi
+	push	{r1}
+	mov	r1, lr
+	lsrs	r1, r1, #1
+	lsls	r1, r1, #1
+	ldrb	r1, [r1, r0]
+	lsls	r1, r1, #1
+	add	lr, lr, r1
+	pop	{r1}
+	bx	lr
+	SIZE (__gnu_thumb1_case_uqi)
+#endif
+
+#ifdef L_thumb1_case_shi
+	
+	.text
+	.align 0
+        .force_thumb
+	.syntax unified
+	THUMB_FUNC_START __gnu_thumb1_case_shi
+	push	{r0, r1}
+	mov	r1, lr
+	lsrs	r1, r1, #1
+	lsls	r0, r0, #1
+	lsls	r1, r1, #1
+	ldrsh	r1, [r1, r0]
+	lsls	r1, r1, #1
+	add	lr, lr, r1
+	pop	{r0, r1}
+	bx	lr
+	SIZE (__gnu_thumb1_case_shi)
+#endif
+
+#ifdef L_thumb1_case_uhi
+	
+	.text
+	.align 0
+        .force_thumb
+	.syntax unified
+	THUMB_FUNC_START __gnu_thumb1_case_uhi
+	push	{r0, r1}
+	mov	r1, lr
+	lsrs	r1, r1, #1
+	lsls	r0, r0, #1
+	lsls	r1, r1, #1
+	ldrh	r1, [r1, r0]
+	lsls	r1, r1, #1
+	add	lr, lr, r1
+	pop	{r0, r1}
+	bx	lr
+	SIZE (__gnu_thumb1_case_uhi)
+#endif
+
+#ifdef L_thumb1_case_si
+	
+	.text
+	.align 0
+        .force_thumb
+	.syntax unified
+	THUMB_FUNC_START __gnu_thumb1_case_si
+	push	{r0, r1}
+	mov	r1, lr
+	adds.n	r1, r1, #2	/* Align to word.  */
+	lsrs	r1, r1, #2
+	lsls	r0, r0, #2
+	lsls	r1, r1, #2
+	ldr	r0, [r1, r0]
+	adds	r0, r0, r1
+	mov	lr, r0
+	pop	{r0, r1}
+	mov	pc, lr		/* We know we were called from thumb code.  */
+	SIZE (__gnu_thumb1_case_si)
+#endif
+
+#endif /* Arch supports thumb.  */
+
+#ifndef __symbian__
+#ifndef __ARM_ARCH_6M__
+#include "ieee754-df.S"
+#include "ieee754-sf.S"
+#include "bpabi.S"
+#else /* __ARM_ARCH_6M__ */
+#include "bpabi-v6m.S"
+#endif /* __ARM_ARCH_6M__ */
+#endif /* !__symbian__ */
diff --git a/libgcc/config/arm/libunwind.S b/libgcc/config/arm/libunwind.S
index a3a19daab4b..8166cd86e47 100644
--- a/libgcc/config/arm/libunwind.S
+++ b/libgcc/config/arm/libunwind.S
@@ -40,7 +40,7 @@
 
 #ifndef __symbian__
 
-#include "config/arm/lib1funcs.asm"
+#include "lib1funcs.S"
 
 .macro UNPREFIX name
 	.global SYM (\name)
diff --git a/libgcc/config/arm/t-arm b/libgcc/config/arm/t-arm
new file mode 100644
index 00000000000..4e17e99b4a5
--- /dev/null
+++ b/libgcc/config/arm/t-arm
@@ -0,0 +1,3 @@
+LIB1ASMSRC = arm/lib1funcs.S
+LIB1ASMFUNCS = _thumb1_case_sqi _thumb1_case_uqi _thumb1_case_shi \
+	_thumb1_case_uhi _thumb1_case_si
diff --git a/libgcc/config/arm/t-bpabi b/libgcc/config/arm/t-bpabi
index ebb2f9fd85d..8787285ab1f 100644
--- a/libgcc/config/arm/t-bpabi
+++ b/libgcc/config/arm/t-bpabi
@@ -1,3 +1,6 @@
+# Add the bpabi.S functions.
+LIB1ASMFUNCS += _aeabi_lcmp _aeabi_ulcmp _aeabi_ldivmod _aeabi_uldivmod
+
 LIB2ADDEH = $(srcdir)/config/arm/unwind-arm.c \
   $(srcdir)/config/arm/libunwind.S \
   $(srcdir)/config/arm/pr-support.c $(srcdir)/unwind-c.c
diff --git a/libgcc/config/arm/t-elf b/libgcc/config/arm/t-elf
new file mode 100644
index 00000000000..fab32e445be
--- /dev/null
+++ b/libgcc/config/arm/t-elf
@@ -0,0 +1,13 @@
+# For most CPUs we have an assembly soft-float implementations.
+# However this is not true for ARMv6M.  Here we want to use the soft-fp C
+# implementation.  The soft-fp code is only build for ARMv6M.  This pulls
+# in the asm implementation for other CPUs.
+LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _bb_init_func \
+	_call_via_rX _interwork_call_via_rX \
+	_lshrdi3 _ashrdi3 _ashldi3 \
+	_arm_negdf2 _arm_addsubdf3 _arm_muldivdf3 _arm_cmpdf2 _arm_unorddf2 \
+	_arm_fixdfsi _arm_fixunsdfsi \
+	_arm_truncdfsf2 _arm_negsf2 _arm_addsubsf3 _arm_muldivsf3 \
+	_arm_cmpsf2 _arm_unordsf2 _arm_fixsfsi _arm_fixunssfsi \
+	_arm_floatdidf _arm_floatdisf _arm_floatundidf _arm_floatundisf \
+	_clzsi2 _clzdi2 
diff --git a/libgcc/config/arm/t-linux b/libgcc/config/arm/t-linux
new file mode 100644
index 00000000000..a154f775a0f
--- /dev/null
+++ b/libgcc/config/arm/t-linux
@@ -0,0 +1,3 @@
+LIB1ASMSRC = arm/lib1funcs.S
+LIB1ASMFUNCS = _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_lnx _clzsi2 _clzdi2 \
+	_arm_addsubdf3 _arm_addsubsf3
diff --git a/libgcc/config/arm/t-linux-eabi b/libgcc/config/arm/t-linux-eabi
new file mode 100644
index 00000000000..dfc9197ea45
--- /dev/null
+++ b/libgcc/config/arm/t-linux-eabi
@@ -0,0 +1,2 @@
+# Use a version of div0 which raises SIGFPE, and a special __clear_cache.
+LIB1ASMFUNCS := $(filter-out _dvmd_tls,$(LIB1ASMFUNCS)) _dvmd_lnx _clear_cache
diff --git a/libgcc/config/arm/t-strongarm-elf b/libgcc/config/arm/t-strongarm-elf
new file mode 100644
index 00000000000..cd9f9667ddf
--- /dev/null
+++ b/libgcc/config/arm/t-strongarm-elf
@@ -0,0 +1 @@
+LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _bb_init_func _clzsi2 _clzdi2
diff --git a/libgcc/config/arm/t-symbian b/libgcc/config/arm/t-symbian
index 6788d5f40b3..1989696c8a3 100644
--- a/libgcc/config/arm/t-symbian
+++ b/libgcc/config/arm/t-symbian
@@ -1,2 +1,16 @@
+LIB1ASMFUNCS += _bb_init_func _call_via_rX _interwork_call_via_rX _clzsi2 _clzdi2
+
+# These functions have __aeabi equivalents and will never be called by GCC.  
+# By putting them in LIB1ASMFUNCS, we avoid the standard libgcc2.c code being
+# used -- and we make sure that definitions are not available in lib1funcs.S,
+# either, so they end up undefined.
+LIB1ASMFUNCS += \
+	_ashldi3 _ashrdi3 _divdi3 _floatdidf _udivmoddi4 _umoddi3 \
+	_udivdi3 _lshrdi3 _moddi3 _muldi3 _negdi2 _cmpdi2 \
+	_fixdfdi _fixsfdi _fixunsdfdi _fixunssfdi _floatdisf \
+	_negdf2 _addsubdf3 _muldivdf3 _cmpdf2 _unorddf2 _fixdfsi _fixunsdfsi \
+	_truncdfsf2 _negsf2 _addsubsf3 _muldivsf3 _cmpsf2 _unordsf2 \
+	_fixsfsi _fixunssfsi
+
 # Include the gcc personality routine
 LIB2ADDEH = $(srcdir)/unwind-c.c $(srcdir)/config/arm/pr-support.c
diff --git a/libgcc/config/arm/t-vxworks b/libgcc/config/arm/t-vxworks
new file mode 100644
index 00000000000..70ccdc1556a
--- /dev/null
+++ b/libgcc/config/arm/t-vxworks
@@ -0,0 +1 @@
+LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _bb_init_func _call_via_rX _interwork_call_via_rX _clzsi2 _clzdi2
diff --git a/libgcc/config/arm/t-wince-pe b/libgcc/config/arm/t-wince-pe
new file mode 100644
index 00000000000..33ea969ccf4
--- /dev/null
+++ b/libgcc/config/arm/t-wince-pe
@@ -0,0 +1 @@
+LIB1ASMFUNCS += _udivsi3 _divsi3 _umodsi3 _modsi3 _dvmd_tls _call_via_rX _interwork_call_via_rX _clzsi2 _clzdi2
diff --git a/libgcc/config/avr/lib1funcs.S b/libgcc/config/avr/lib1funcs.S
new file mode 100644
index 00000000000..8c369c96a77
--- /dev/null
+++ b/libgcc/config/avr/lib1funcs.S
@@ -0,0 +1,1533 @@
+/*  -*- Mode: Asm -*-  */
+/* Copyright (C) 1998, 1999, 2000, 2007, 2008, 2009
+   Free Software Foundation, Inc.
+   Contributed by Denis Chertykov <chertykov@gmail.com>
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#define __zero_reg__ r1
+#define __tmp_reg__ r0
+#define __SREG__ 0x3f
+#define __SP_H__ 0x3e
+#define __SP_L__ 0x3d
+#define __RAMPZ__ 0x3B
+#define __EIND__  0x3C
+
+/* Most of the functions here are called directly from avr.md
+   patterns, instead of using the standard libcall mechanisms.
+   This can make better code because GCC knows exactly which
+   of the call-used registers (not all of them) are clobbered.  */
+
+/* FIXME:  At present, there is no SORT directive in the linker
+           script so that we must not assume that different modules
+           in the same input section like .libgcc.text.mul will be
+           located close together.  Therefore, we cannot use
+           RCALL/RJMP to call a function like __udivmodhi4 from
+           __divmodhi4 and have to use lengthy XCALL/XJMP even
+           though they are in the same input section and all same
+           input sections together are small enough to reach every
+           location with a RCALL/RJMP instruction.  */
+
+	.macro	mov_l  r_dest, r_src
+#if defined (__AVR_HAVE_MOVW__)
+	movw	\r_dest, \r_src
+#else
+	mov	\r_dest, \r_src
+#endif
+	.endm
+
+	.macro	mov_h  r_dest, r_src
+#if defined (__AVR_HAVE_MOVW__)
+	; empty
+#else
+	mov	\r_dest, \r_src
+#endif
+	.endm
+
+#if defined (__AVR_HAVE_JMP_CALL__)
+#define XCALL call
+#define XJMP  jmp
+#else
+#define XCALL rcall
+#define XJMP  rjmp
+#endif
+
+.macro DEFUN name
+.global \name
+.func \name
+\name:
+.endm
+
+.macro ENDF name
+.size \name, .-\name
+.endfunc
+.endm
+
+
+.section .text.libgcc.mul, "ax", @progbits
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+/* Note: mulqi3, mulhi3 are open-coded on the enhanced core.  */
+#if !defined (__AVR_HAVE_MUL__)
+/*******************************************************
+    Multiplication  8 x 8  without MUL
+*******************************************************/
+#if defined (L_mulqi3)
+
+#define	r_arg2	r22		/* multiplicand */
+#define	r_arg1 	r24		/* multiplier */
+#define r_res	__tmp_reg__	/* result */
+
+DEFUN __mulqi3
+	clr	r_res		; clear result
+__mulqi3_loop:
+	sbrc	r_arg1,0
+	add	r_res,r_arg2
+	add	r_arg2,r_arg2	; shift multiplicand
+	breq	__mulqi3_exit	; while multiplicand != 0
+	lsr	r_arg1		; 
+	brne	__mulqi3_loop	; exit if multiplier = 0
+__mulqi3_exit:	
+	mov	r_arg1,r_res	; result to return register
+	ret
+ENDF __mulqi3
+
+#undef r_arg2  
+#undef r_arg1  
+#undef r_res   
+	
+#endif 	/* defined (L_mulqi3) */
+
+#if defined (L_mulqihi3)
+DEFUN __mulqihi3
+	clr	r25
+	sbrc	r24, 7
+	dec	r25
+	clr	r23
+	sbrc	r22, 7
+	dec	r22
+	XJMP	__mulhi3
+ENDF __mulqihi3:
+#endif /* defined (L_mulqihi3) */
+
+#if defined (L_umulqihi3)
+DEFUN __umulqihi3
+	clr	r25
+	clr	r23
+	XJMP	__mulhi3
+ENDF __umulqihi3
+#endif /* defined (L_umulqihi3) */
+
+/*******************************************************
+    Multiplication  16 x 16  without MUL
+*******************************************************/
+#if defined (L_mulhi3)
+#define	r_arg1L	r24		/* multiplier Low */
+#define	r_arg1H	r25		/* multiplier High */
+#define	r_arg2L	r22		/* multiplicand Low */
+#define	r_arg2H	r23		/* multiplicand High */
+#define r_resL	__tmp_reg__	/* result Low */
+#define r_resH  r21		/* result High */
+
+DEFUN __mulhi3
+	clr	r_resH		; clear result
+	clr	r_resL		; clear result
+__mulhi3_loop:
+	sbrs	r_arg1L,0
+	rjmp	__mulhi3_skip1
+	add	r_resL,r_arg2L	; result + multiplicand
+	adc	r_resH,r_arg2H
+__mulhi3_skip1:	
+	add	r_arg2L,r_arg2L	; shift multiplicand
+	adc	r_arg2H,r_arg2H
+
+	cp	r_arg2L,__zero_reg__
+	cpc	r_arg2H,__zero_reg__
+	breq	__mulhi3_exit	; while multiplicand != 0
+
+	lsr	r_arg1H		; gets LSB of multiplier
+	ror	r_arg1L
+	sbiw	r_arg1L,0
+	brne	__mulhi3_loop	; exit if multiplier = 0
+__mulhi3_exit:
+	mov	r_arg1H,r_resH	; result to return register
+	mov	r_arg1L,r_resL
+	ret
+ENDF __mulhi3
+
+#undef r_arg1L
+#undef r_arg1H
+#undef r_arg2L
+#undef r_arg2H
+#undef r_resL 	
+#undef r_resH 
+
+#endif /* defined (L_mulhi3) */
+
+/*******************************************************
+    Widening Multiplication  32 = 16 x 16  without MUL
+*******************************************************/
+
+#if defined (L_mulhisi3)
+DEFUN __mulhisi3
+;;; FIXME: This is dead code (noone calls it)
+    mov_l   r18, r24
+    mov_h   r19, r25
+    clr     r24
+    sbrc    r23, 7
+    dec     r24
+    mov     r25, r24
+    clr     r20
+    sbrc    r19, 7
+    dec     r20
+    mov     r21, r20
+    XJMP    __mulsi3
+ENDF __mulhisi3
+#endif /* defined (L_mulhisi3) */
+
+#if defined (L_umulhisi3)
+DEFUN __umulhisi3
+;;; FIXME: This is dead code (noone calls it)
+    mov_l   r18, r24
+    mov_h   r19, r25
+    clr     r24
+    clr     r25
+    mov_l   r20, r24
+    mov_h   r21, r25
+    XJMP    __mulsi3
+ENDF __umulhisi3
+#endif /* defined (L_umulhisi3) */
+
+#if defined (L_mulsi3)
+/*******************************************************
+    Multiplication  32 x 32  without MUL
+*******************************************************/
+#define r_arg1L  r22		/* multiplier Low */
+#define r_arg1H  r23
+#define	r_arg1HL r24
+#define	r_arg1HH r25		/* multiplier High */
+
+#define	r_arg2L  r18		/* multiplicand Low */
+#define	r_arg2H  r19	
+#define	r_arg2HL r20
+#define	r_arg2HH r21		/* multiplicand High */
+	
+#define r_resL	 r26		/* result Low */
+#define r_resH   r27
+#define r_resHL	 r30
+#define r_resHH  r31		/* result High */
+
+DEFUN __mulsi3
+	clr	r_resHH		; clear result
+	clr	r_resHL		; clear result
+	clr	r_resH		; clear result
+	clr	r_resL		; clear result
+__mulsi3_loop:
+	sbrs	r_arg1L,0
+	rjmp	__mulsi3_skip1
+	add	r_resL,r_arg2L		; result + multiplicand
+	adc	r_resH,r_arg2H
+	adc	r_resHL,r_arg2HL
+	adc	r_resHH,r_arg2HH
+__mulsi3_skip1:
+	add	r_arg2L,r_arg2L		; shift multiplicand
+	adc	r_arg2H,r_arg2H
+	adc	r_arg2HL,r_arg2HL
+	adc	r_arg2HH,r_arg2HH
+	
+	lsr	r_arg1HH	; gets LSB of multiplier
+	ror	r_arg1HL
+	ror	r_arg1H
+	ror	r_arg1L
+	brne	__mulsi3_loop
+	sbiw	r_arg1HL,0
+	cpc	r_arg1H,r_arg1L
+	brne	__mulsi3_loop		; exit if multiplier = 0
+__mulsi3_exit:
+	mov_h	r_arg1HH,r_resHH	; result to return register
+	mov_l	r_arg1HL,r_resHL
+	mov_h	r_arg1H,r_resH
+	mov_l	r_arg1L,r_resL
+	ret
+ENDF __mulsi3
+
+#undef r_arg1L 
+#undef r_arg1H 
+#undef r_arg1HL
+#undef r_arg1HH
+             
+#undef r_arg2L 
+#undef r_arg2H 
+#undef r_arg2HL
+#undef r_arg2HH
+             
+#undef r_resL  
+#undef r_resH  
+#undef r_resHL 
+#undef r_resHH 
+
+#endif /* defined (L_mulsi3) */
+
+#endif /* !defined (__AVR_HAVE_MUL__) */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+#if defined (__AVR_HAVE_MUL__)    
+#define A0 26
+#define B0 18
+#define C0 22
+
+#define A1 A0+1
+
+#define B1 B0+1
+#define B2 B0+2
+#define B3 B0+3
+
+#define C1 C0+1
+#define C2 C0+2
+#define C3 C0+3
+
+/*******************************************************
+    Widening Multiplication  32 = 16 x 16
+*******************************************************/
+                              
+#if defined (L_mulhisi3)
+;;; R25:R22 = (signed long) R27:R26 * (signed long) R19:R18
+;;; C3:C0   = (signed long) A1:A0   * (signed long) B1:B0
+;;; Clobbers: __tmp_reg__
+DEFUN __mulhisi3
+    XCALL   __umulhisi3
+    ;; Sign-extend B
+    tst     B1
+    brpl    1f
+    sub     C2, A0
+    sbc     C3, A1
+1:  ;; Sign-extend A
+    XJMP __usmulhisi3_tail
+ENDF __mulhisi3
+#endif /* L_mulhisi3 */
+
+#if defined (L_usmulhisi3)
+;;; R25:R22 = (signed long) R27:R26 * (unsigned long) R19:R18
+;;; C3:C0   = (signed long) A1:A0   * (unsigned long) B1:B0
+;;; Clobbers: __tmp_reg__
+DEFUN __usmulhisi3
+    XCALL   __umulhisi3
+    ;; FALLTHRU
+ENDF __usmulhisi3
+
+DEFUN __usmulhisi3_tail
+    ;; Sign-extend A
+    sbrs    A1, 7
+    ret
+    sub     C2, B0
+    sbc     C3, B1
+    ret
+ENDF __usmulhisi3_tail
+#endif /* L_usmulhisi3 */
+
+#if defined (L_umulhisi3)
+;;; R25:R22 = (unsigned long) R27:R26 * (unsigned long) R19:R18
+;;; C3:C0   = (unsigned long) A1:A0   * (unsigned long) B1:B0
+;;; Clobbers: __tmp_reg__
+DEFUN __umulhisi3
+    mul     A0, B0
+    movw    C0, r0
+    mul     A1, B1
+    movw    C2, r0
+    mul     A0, B1
+    rcall   1f
+    mul     A1, B0
+1:  add     C1, r0
+    adc     C2, r1
+    clr     __zero_reg__
+    adc     C3, __zero_reg__
+    ret
+ENDF __umulhisi3
+#endif /* L_umulhisi3 */
+
+/*******************************************************
+    Widening Multiplication  32 = 16 x 32
+*******************************************************/
+
+#if defined (L_mulshisi3)
+;;; R25:R22 = (signed long) R27:R26 * R21:R18
+;;; (C3:C0) = (signed long) A1:A0   * B3:B0
+;;; Clobbers: __tmp_reg__
+DEFUN __mulshisi3
+#ifdef __AVR_ERRATA_SKIP_JMP_CALL__
+    ;; Some cores have problem skipping 2-word instruction
+    tst     A1
+    brmi    __mulohisi3
+#else
+    sbrs    A1, 7
+#endif /* __AVR_HAVE_JMP_CALL__ */
+    XJMP    __muluhisi3
+    ;; FALLTHRU
+ENDF __mulshisi3
+    
+;;; R25:R22 = (one-extended long) R27:R26 * R21:R18
+;;; (C3:C0) = (one-extended long) A1:A0   * B3:B0
+;;; Clobbers: __tmp_reg__
+DEFUN __mulohisi3
+    XCALL   __muluhisi3
+    ;; One-extend R27:R26 (A1:A0)
+    sub     C2, B0
+    sbc     C3, B1
+    ret
+ENDF __mulohisi3
+#endif /* L_mulshisi3 */
+
+#if defined (L_muluhisi3)
+;;; R25:R22 = (unsigned long) R27:R26 * R21:R18
+;;; (C3:C0) = (unsigned long) A1:A0   * B3:B0
+;;; Clobbers: __tmp_reg__
+DEFUN __muluhisi3
+    XCALL   __umulhisi3
+    mul     A0, B3
+    add     C3, r0
+    mul     A1, B2
+    add     C3, r0
+    mul     A0, B2
+    add     C2, r0
+    adc     C3, r1
+    clr     __zero_reg__
+    ret
+ENDF __muluhisi3
+#endif /* L_muluhisi3 */
+
+/*******************************************************
+    Multiplication  32 x 32
+*******************************************************/
+
+#if defined (L_mulsi3)
+;;; R25:R22 = R25:R22 * R21:R18
+;;; (C3:C0) = C3:C0   * B3:B0
+;;; Clobbers: R26, R27, __tmp_reg__
+DEFUN __mulsi3
+    movw    A0, C0
+    push    C2
+    push    C3
+    XCALL   __muluhisi3
+    pop     A1
+    pop     A0
+    ;; A1:A0 now contains the high word of A
+    mul     A0, B0
+    add     C2, r0
+    adc     C3, r1
+    mul     A0, B1
+    add     C3, r0
+    mul     A1, B0
+    add     C3, r0
+    clr     __zero_reg__
+    ret
+ENDF __mulsi3
+#endif /* L_mulsi3 */
+
+#undef A0
+#undef A1
+
+#undef B0
+#undef B1
+#undef B2
+#undef B3
+
+#undef C0
+#undef C1
+#undef C2
+#undef C3
+
+#endif /* __AVR_HAVE_MUL__ */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	
+
+.section .text.libgcc.div, "ax", @progbits
+
+/*******************************************************
+       Division 8 / 8 => (result + remainder)
+*******************************************************/
+#define	r_rem	r25	/* remainder */
+#define	r_arg1	r24	/* dividend, quotient */
+#define	r_arg2	r22	/* divisor */
+#define	r_cnt	r23	/* loop count */
+
+#if defined (L_udivmodqi4)
+DEFUN __udivmodqi4
+	sub	r_rem,r_rem	; clear remainder and carry
+	ldi	r_cnt,9		; init loop counter
+	rjmp	__udivmodqi4_ep	; jump to entry point
+__udivmodqi4_loop:
+	rol	r_rem		; shift dividend into remainder
+	cp	r_rem,r_arg2	; compare remainder & divisor
+	brcs	__udivmodqi4_ep	; remainder <= divisor
+	sub	r_rem,r_arg2	; restore remainder
+__udivmodqi4_ep:
+	rol	r_arg1		; shift dividend (with CARRY)
+	dec	r_cnt		; decrement loop counter
+	brne	__udivmodqi4_loop
+	com	r_arg1		; complement result 
+				; because C flag was complemented in loop
+	ret
+ENDF __udivmodqi4
+#endif /* defined (L_udivmodqi4) */
+
+#if defined (L_divmodqi4)
+DEFUN __divmodqi4
+        bst     r_arg1,7	; store sign of dividend
+        mov     __tmp_reg__,r_arg1
+        eor     __tmp_reg__,r_arg2; r0.7 is sign of result
+        sbrc	r_arg1,7
+	neg     r_arg1		; dividend negative : negate
+        sbrc	r_arg2,7
+	neg     r_arg2		; divisor negative : negate
+	XCALL	__udivmodqi4	; do the unsigned div/mod
+	brtc	__divmodqi4_1
+	neg	r_rem		; correct remainder sign
+__divmodqi4_1:
+	sbrc	__tmp_reg__,7
+	neg	r_arg1		; correct result sign
+__divmodqi4_exit:
+	ret
+ENDF __divmodqi4
+#endif /* defined (L_divmodqi4) */
+
+#undef r_rem
+#undef r_arg1
+#undef r_arg2
+#undef r_cnt
+	
+		
+/*******************************************************
+       Division 16 / 16 => (result + remainder)
+*******************************************************/
+#define	r_remL	r26	/* remainder Low */
+#define	r_remH	r27	/* remainder High */
+
+/* return: remainder */
+#define	r_arg1L	r24	/* dividend Low */
+#define	r_arg1H	r25	/* dividend High */
+
+/* return: quotient */
+#define	r_arg2L	r22	/* divisor Low */
+#define	r_arg2H	r23	/* divisor High */
+	
+#define	r_cnt	r21	/* loop count */
+
+#if defined (L_udivmodhi4)
+DEFUN __udivmodhi4
+	sub	r_remL,r_remL
+	sub	r_remH,r_remH	; clear remainder and carry
+	ldi	r_cnt,17	; init loop counter
+	rjmp	__udivmodhi4_ep	; jump to entry point
+__udivmodhi4_loop:
+        rol	r_remL		; shift dividend into remainder
+	rol	r_remH
+        cp	r_remL,r_arg2L	; compare remainder & divisor
+	cpc	r_remH,r_arg2H
+        brcs	__udivmodhi4_ep	; remainder < divisor
+        sub	r_remL,r_arg2L	; restore remainder
+        sbc	r_remH,r_arg2H
+__udivmodhi4_ep:
+        rol	r_arg1L		; shift dividend (with CARRY)
+        rol	r_arg1H
+        dec	r_cnt		; decrement loop counter
+        brne	__udivmodhi4_loop
+	com	r_arg1L
+	com	r_arg1H
+; div/mod results to return registers, as for the div() function
+	mov_l	r_arg2L, r_arg1L	; quotient
+	mov_h	r_arg2H, r_arg1H
+	mov_l	r_arg1L, r_remL		; remainder
+	mov_h	r_arg1H, r_remH
+	ret
+ENDF __udivmodhi4
+#endif /* defined (L_udivmodhi4) */
+
+#if defined (L_divmodhi4)
+DEFUN __divmodhi4
+	.global	_div
+_div:
+        bst     r_arg1H,7	; store sign of dividend
+        mov     __tmp_reg__,r_arg1H
+        eor     __tmp_reg__,r_arg2H   ; r0.7 is sign of result
+	rcall	__divmodhi4_neg1 ; dividend negative : negate
+	sbrc	r_arg2H,7
+	rcall	__divmodhi4_neg2 ; divisor negative : negate
+	XCALL	__udivmodhi4	; do the unsigned div/mod
+	rcall	__divmodhi4_neg1 ; correct remainder sign
+	tst	__tmp_reg__
+	brpl	__divmodhi4_exit
+__divmodhi4_neg2:
+	com	r_arg2H
+	neg	r_arg2L		; correct divisor/result sign
+	sbci	r_arg2H,0xff
+__divmodhi4_exit:
+	ret
+__divmodhi4_neg1:
+	brtc	__divmodhi4_exit
+	com	r_arg1H
+	neg	r_arg1L		; correct dividend/remainder sign
+	sbci	r_arg1H,0xff
+	ret
+ENDF __divmodhi4
+#endif /* defined (L_divmodhi4) */
+
+#undef r_remH  
+#undef r_remL  
+             
+#undef r_arg1H 
+#undef r_arg1L 
+             
+#undef r_arg2H 
+#undef r_arg2L 
+             	
+#undef r_cnt   	
+	
+/*******************************************************
+       Division 32 / 32 => (result + remainder)
+*******************************************************/
+#define	r_remHH	r31	/* remainder High */
+#define	r_remHL	r30
+#define	r_remH	r27
+#define	r_remL	r26	/* remainder Low */
+
+/* return: remainder */
+#define	r_arg1HH r25	/* dividend High */
+#define	r_arg1HL r24
+#define	r_arg1H  r23
+#define	r_arg1L  r22	/* dividend Low */
+
+/* return: quotient */
+#define	r_arg2HH r21	/* divisor High */
+#define	r_arg2HL r20
+#define	r_arg2H  r19
+#define	r_arg2L  r18	/* divisor Low */
+	
+#define	r_cnt __zero_reg__  /* loop count (0 after the loop!) */
+
+#if defined (L_udivmodsi4)
+DEFUN __udivmodsi4
+	ldi	r_remL, 33	; init loop counter
+	mov	r_cnt, r_remL
+	sub	r_remL,r_remL
+	sub	r_remH,r_remH	; clear remainder and carry
+	mov_l	r_remHL, r_remL
+	mov_h	r_remHH, r_remH
+	rjmp	__udivmodsi4_ep	; jump to entry point
+__udivmodsi4_loop:
+        rol	r_remL		; shift dividend into remainder
+	rol	r_remH
+	rol	r_remHL
+	rol	r_remHH
+        cp	r_remL,r_arg2L	; compare remainder & divisor
+	cpc	r_remH,r_arg2H
+	cpc	r_remHL,r_arg2HL
+	cpc	r_remHH,r_arg2HH
+	brcs	__udivmodsi4_ep	; remainder <= divisor
+        sub	r_remL,r_arg2L	; restore remainder
+        sbc	r_remH,r_arg2H
+        sbc	r_remHL,r_arg2HL
+        sbc	r_remHH,r_arg2HH
+__udivmodsi4_ep:
+        rol	r_arg1L		; shift dividend (with CARRY)
+        rol	r_arg1H
+        rol	r_arg1HL
+        rol	r_arg1HH
+        dec	r_cnt		; decrement loop counter
+        brne	__udivmodsi4_loop
+				; __zero_reg__ now restored (r_cnt == 0)
+	com	r_arg1L
+	com	r_arg1H
+	com	r_arg1HL
+	com	r_arg1HH
+; div/mod results to return registers, as for the ldiv() function
+	mov_l	r_arg2L,  r_arg1L	; quotient
+	mov_h	r_arg2H,  r_arg1H
+	mov_l	r_arg2HL, r_arg1HL
+	mov_h	r_arg2HH, r_arg1HH
+	mov_l	r_arg1L,  r_remL	; remainder
+	mov_h	r_arg1H,  r_remH
+	mov_l	r_arg1HL, r_remHL
+	mov_h	r_arg1HH, r_remHH
+	ret
+ENDF __udivmodsi4
+#endif /* defined (L_udivmodsi4) */
+
+#if defined (L_divmodsi4)
+DEFUN __divmodsi4
+        bst     r_arg1HH,7	; store sign of dividend
+        mov     __tmp_reg__,r_arg1HH
+        eor     __tmp_reg__,r_arg2HH   ; r0.7 is sign of result
+	rcall	__divmodsi4_neg1 ; dividend negative : negate
+	sbrc	r_arg2HH,7
+	rcall	__divmodsi4_neg2 ; divisor negative : negate
+	XCALL	__udivmodsi4	; do the unsigned div/mod
+	rcall	__divmodsi4_neg1 ; correct remainder sign
+	rol	__tmp_reg__
+	brcc	__divmodsi4_exit
+__divmodsi4_neg2:
+	com	r_arg2HH
+	com	r_arg2HL
+	com	r_arg2H
+	neg	r_arg2L		; correct divisor/quotient sign
+	sbci	r_arg2H,0xff
+	sbci	r_arg2HL,0xff
+	sbci	r_arg2HH,0xff
+__divmodsi4_exit:
+	ret
+__divmodsi4_neg1:
+	brtc	__divmodsi4_exit
+	com	r_arg1HH
+	com	r_arg1HL
+	com	r_arg1H
+	neg	r_arg1L		; correct dividend/remainder sign
+	sbci	r_arg1H, 0xff
+	sbci	r_arg1HL,0xff
+	sbci	r_arg1HH,0xff
+	ret
+ENDF __divmodsi4
+#endif /* defined (L_divmodsi4) */
+
+
+.section .text.libgcc.prologue, "ax", @progbits
+    
+/**********************************
+ * This is a prologue subroutine
+ **********************************/
+#if defined (L_prologue)
+
+DEFUN __prologue_saves__
+	push r2
+	push r3
+	push r4
+	push r5
+	push r6
+	push r7
+	push r8
+	push r9
+	push r10
+	push r11
+	push r12
+	push r13
+	push r14
+	push r15
+	push r16
+	push r17
+	push r28
+	push r29
+	in	r28,__SP_L__
+	in	r29,__SP_H__
+	sub	r28,r26
+	sbc	r29,r27
+	in	__tmp_reg__,__SREG__
+	cli
+	out	__SP_H__,r29
+	out	__SREG__,__tmp_reg__
+	out	__SP_L__,r28
+#if defined (__AVR_HAVE_EIJMP_EICALL__)
+	eijmp
+#else
+	ijmp
+#endif
+
+ENDF __prologue_saves__
+#endif /* defined (L_prologue) */
+
+/*
+ * This is an epilogue subroutine
+ */
+#if defined (L_epilogue)
+
+DEFUN __epilogue_restores__
+	ldd	r2,Y+18
+	ldd	r3,Y+17
+	ldd	r4,Y+16
+	ldd	r5,Y+15
+	ldd	r6,Y+14
+	ldd	r7,Y+13
+	ldd	r8,Y+12
+	ldd	r9,Y+11
+	ldd	r10,Y+10
+	ldd	r11,Y+9
+	ldd	r12,Y+8
+	ldd	r13,Y+7
+	ldd	r14,Y+6
+	ldd	r15,Y+5
+	ldd	r16,Y+4
+	ldd	r17,Y+3
+	ldd	r26,Y+2
+	ldd	r27,Y+1
+	add	r28,r30
+	adc	r29,__zero_reg__
+	in	__tmp_reg__,__SREG__
+	cli
+	out	__SP_H__,r29
+	out	__SREG__,__tmp_reg__
+	out	__SP_L__,r28
+	mov_l	r28, r26
+	mov_h	r29, r27
+	ret
+ENDF __epilogue_restores__
+#endif /* defined (L_epilogue) */
+
+#ifdef L_exit
+	.section .fini9,"ax",@progbits
+DEFUN _exit
+	.weak	exit
+exit:
+ENDF _exit
+
+	/* Code from .fini8 ... .fini1 sections inserted by ld script.  */
+
+	.section .fini0,"ax",@progbits
+	cli
+__stop_program:
+	rjmp	__stop_program
+#endif /* defined (L_exit) */
+
+#ifdef L_cleanup
+	.weak	_cleanup
+	.func	_cleanup
+_cleanup:
+	ret
+.endfunc
+#endif /* defined (L_cleanup) */
+
+
+.section .text.libgcc, "ax", @progbits
+    
+#ifdef L_tablejump
+DEFUN __tablejump2__
+	lsl	r30
+	rol	r31
+    ;; FALLTHRU
+ENDF __tablejump2__
+
+DEFUN __tablejump__
+#if defined (__AVR_HAVE_LPMX__)
+	lpm __tmp_reg__, Z+
+	lpm r31, Z
+	mov r30, __tmp_reg__
+#if defined (__AVR_HAVE_EIJMP_EICALL__)
+	eijmp
+#else
+	ijmp
+#endif
+
+#else /* !HAVE_LPMX */
+	lpm
+	adiw r30, 1
+	push r0
+	lpm
+	push r0
+#if defined (__AVR_HAVE_EIJMP_EICALL__)
+	in   __tmp_reg__, __EIND__
+	push __tmp_reg__
+#endif
+	ret
+#endif /* !HAVE_LPMX */
+ENDF __tablejump__
+#endif /* defined (L_tablejump) */
+
+#ifdef L_copy_data
+	.section .init4,"ax",@progbits
+DEFUN __do_copy_data
+#if defined(__AVR_HAVE_ELPMX__)
+	ldi	r17, hi8(__data_end)
+	ldi	r26, lo8(__data_start)
+	ldi	r27, hi8(__data_start)
+	ldi	r30, lo8(__data_load_start)
+	ldi	r31, hi8(__data_load_start)
+	ldi	r16, hh8(__data_load_start)
+	out	__RAMPZ__, r16
+	rjmp	.L__do_copy_data_start
+.L__do_copy_data_loop:
+	elpm	r0, Z+
+	st	X+, r0
+.L__do_copy_data_start:
+	cpi	r26, lo8(__data_end)
+	cpc	r27, r17
+	brne	.L__do_copy_data_loop
+#elif  !defined(__AVR_HAVE_ELPMX__) && defined(__AVR_HAVE_ELPM__)
+	ldi	r17, hi8(__data_end)
+	ldi	r26, lo8(__data_start)
+	ldi	r27, hi8(__data_start)
+	ldi	r30, lo8(__data_load_start)
+	ldi	r31, hi8(__data_load_start)
+	ldi	r16, hh8(__data_load_start - 0x10000)
+.L__do_copy_data_carry:
+	inc	r16
+	out	__RAMPZ__, r16
+	rjmp	.L__do_copy_data_start
+.L__do_copy_data_loop:
+	elpm
+	st	X+, r0
+	adiw	r30, 1
+	brcs	.L__do_copy_data_carry
+.L__do_copy_data_start:
+	cpi	r26, lo8(__data_end)
+	cpc	r27, r17
+	brne	.L__do_copy_data_loop
+#elif !defined(__AVR_HAVE_ELPMX__) && !defined(__AVR_HAVE_ELPM__)
+	ldi	r17, hi8(__data_end)
+	ldi	r26, lo8(__data_start)
+	ldi	r27, hi8(__data_start)
+	ldi	r30, lo8(__data_load_start)
+	ldi	r31, hi8(__data_load_start)
+	rjmp	.L__do_copy_data_start
+.L__do_copy_data_loop:
+#if defined (__AVR_HAVE_LPMX__)
+	lpm	r0, Z+
+#else
+	lpm
+	adiw	r30, 1
+#endif
+	st	X+, r0
+.L__do_copy_data_start:
+	cpi	r26, lo8(__data_end)
+	cpc	r27, r17
+	brne	.L__do_copy_data_loop
+#endif /* !defined(__AVR_HAVE_ELPMX__) && !defined(__AVR_HAVE_ELPM__) */
+ENDF __do_copy_data
+#endif /* L_copy_data */
+
+/* __do_clear_bss is only necessary if there is anything in .bss section.  */
+
+#ifdef L_clear_bss
+	.section .init4,"ax",@progbits
+DEFUN __do_clear_bss
+	ldi	r17, hi8(__bss_end)
+	ldi	r26, lo8(__bss_start)
+	ldi	r27, hi8(__bss_start)
+	rjmp	.do_clear_bss_start
+.do_clear_bss_loop:
+	st	X+, __zero_reg__
+.do_clear_bss_start:
+	cpi	r26, lo8(__bss_end)
+	cpc	r27, r17
+	brne	.do_clear_bss_loop
+ENDF __do_clear_bss
+#endif /* L_clear_bss */
+
+/* __do_global_ctors and __do_global_dtors are only necessary
+   if there are any constructors/destructors.  */
+
+#ifdef L_ctors
+	.section .init6,"ax",@progbits
+DEFUN __do_global_ctors
+#if defined(__AVR_HAVE_RAMPZ__)
+	ldi	r17, hi8(__ctors_start)
+	ldi	r28, lo8(__ctors_end)
+	ldi	r29, hi8(__ctors_end)
+	ldi	r16, hh8(__ctors_end)
+	rjmp	.L__do_global_ctors_start
+.L__do_global_ctors_loop:
+	sbiw	r28, 2
+	sbc     r16, __zero_reg__
+	mov_h	r31, r29
+	mov_l	r30, r28
+	out     __RAMPZ__, r16
+	XCALL	__tablejump_elpm__
+.L__do_global_ctors_start:
+	cpi	r28, lo8(__ctors_start)
+	cpc	r29, r17
+	ldi	r24, hh8(__ctors_start)
+	cpc	r16, r24
+	brne	.L__do_global_ctors_loop
+#else
+	ldi	r17, hi8(__ctors_start)
+	ldi	r28, lo8(__ctors_end)
+	ldi	r29, hi8(__ctors_end)
+	rjmp	.L__do_global_ctors_start
+.L__do_global_ctors_loop:
+	sbiw	r28, 2
+	mov_h	r31, r29
+	mov_l	r30, r28
+	XCALL	__tablejump__
+.L__do_global_ctors_start:
+	cpi	r28, lo8(__ctors_start)
+	cpc	r29, r17
+	brne	.L__do_global_ctors_loop
+#endif /* defined(__AVR_HAVE_RAMPZ__) */
+ENDF __do_global_ctors
+#endif /* L_ctors */
+
+#ifdef L_dtors
+	.section .fini6,"ax",@progbits
+DEFUN __do_global_dtors
+#if defined(__AVR_HAVE_RAMPZ__)
+	ldi	r17, hi8(__dtors_end)
+	ldi	r28, lo8(__dtors_start)
+	ldi	r29, hi8(__dtors_start)
+	ldi	r16, hh8(__dtors_start)
+	rjmp	.L__do_global_dtors_start
+.L__do_global_dtors_loop:
+	sbiw	r28, 2
+	sbc     r16, __zero_reg__
+	mov_h	r31, r29
+	mov_l	r30, r28
+	out     __RAMPZ__, r16
+	XCALL	__tablejump_elpm__
+.L__do_global_dtors_start:
+	cpi	r28, lo8(__dtors_end)
+	cpc	r29, r17
+	ldi	r24, hh8(__dtors_end)
+	cpc	r16, r24
+	brne	.L__do_global_dtors_loop
+#else
+	ldi	r17, hi8(__dtors_end)
+	ldi	r28, lo8(__dtors_start)
+	ldi	r29, hi8(__dtors_start)
+	rjmp	.L__do_global_dtors_start
+.L__do_global_dtors_loop:
+	mov_h	r31, r29
+	mov_l	r30, r28
+	XCALL	__tablejump__
+	adiw	r28, 2
+.L__do_global_dtors_start:
+	cpi	r28, lo8(__dtors_end)
+	cpc	r29, r17
+	brne	.L__do_global_dtors_loop
+#endif /* defined(__AVR_HAVE_RAMPZ__) */
+ENDF __do_global_dtors
+#endif /* L_dtors */
+
+.section .text.libgcc, "ax", @progbits
+    
+#ifdef L_tablejump_elpm
+DEFUN __tablejump_elpm__
+#if defined (__AVR_HAVE_ELPM__)
+#if defined (__AVR_HAVE_LPMX__)
+	elpm	__tmp_reg__, Z+
+	elpm	r31, Z
+	mov	r30, __tmp_reg__
+#if defined (__AVR_HAVE_EIJMP_EICALL__)
+	eijmp
+#else
+	ijmp
+#endif
+
+#else
+	elpm
+	adiw	r30, 1
+	push	r0
+	elpm
+	push	r0
+#if defined (__AVR_HAVE_EIJMP_EICALL__)
+	in      __tmp_reg__, __EIND__
+	push    __tmp_reg__
+#endif
+	ret
+#endif
+#endif /* defined (__AVR_HAVE_ELPM__) */
+ENDF __tablejump_elpm__
+#endif /* defined (L_tablejump_elpm) */
+
+
+.section .text.libgcc.builtins, "ax", @progbits
+
+/**********************************
+ * Find first set Bit (ffs)
+ **********************************/
+
+#if defined (L_ffssi2)
+;; find first set bit
+;; r25:r24 = ffs32 (r25:r22)
+;; clobbers: r22, r26
+DEFUN __ffssi2
+    clr  r26
+    tst  r22
+    brne 1f
+    subi r26, -8
+    or   r22, r23
+    brne 1f
+    subi r26, -8
+    or   r22, r24
+    brne 1f
+    subi r26, -8
+    or   r22, r25
+    brne 1f
+    ret
+1:  mov  r24, r22
+    XJMP __loop_ffsqi2
+ENDF __ffssi2
+#endif /* defined (L_ffssi2) */
+
+#if defined (L_ffshi2)
+;; find first set bit
+;; r25:r24 = ffs16 (r25:r24)
+;; clobbers: r26
+DEFUN __ffshi2
+    clr  r26
+#ifdef __AVR_ERRATA_SKIP_JMP_CALL__
+    ;; Some cores have problem skipping 2-word instruction
+    tst  r24
+    breq 2f
+#else
+    cpse r24, __zero_reg__
+#endif /* __AVR_HAVE_JMP_CALL__ */
+1:  XJMP __loop_ffsqi2
+2:  ldi  r26, 8
+    or   r24, r25
+    brne 1b
+    ret
+ENDF __ffshi2
+#endif /* defined (L_ffshi2) */
+
+#if defined (L_loop_ffsqi2)
+;; Helper for ffshi2, ffssi2
+;; r25:r24 = r26 + zero_extend16 (ffs8(r24))
+;; r24 must be != 0
+;; clobbers: r26
+DEFUN __loop_ffsqi2
+    inc  r26
+    lsr  r24
+    brcc __loop_ffsqi2
+    mov  r24, r26
+    clr  r25
+    ret    
+ENDF __loop_ffsqi2
+#endif /* defined (L_loop_ffsqi2) */
+
+
+/**********************************
+ * Count trailing Zeros (ctz)
+ **********************************/
+
+#if defined (L_ctzsi2)
+;; count trailing zeros
+;; r25:r24 = ctz32 (r25:r22)
+;; clobbers: r26, r22
+;; ctz(0) = 255
+;; Note that ctz(0) in undefined for GCC
+DEFUN __ctzsi2
+    XCALL __ffssi2
+    dec  r24
+    ret
+ENDF __ctzsi2
+#endif /* defined (L_ctzsi2) */
+
+#if defined (L_ctzhi2)
+;; count trailing zeros
+;; r25:r24 = ctz16 (r25:r24)
+;; clobbers: r26
+;; ctz(0) = 255
+;; Note that ctz(0) in undefined for GCC
+DEFUN __ctzhi2
+    XCALL __ffshi2
+    dec  r24
+    ret
+ENDF __ctzhi2
+#endif /* defined (L_ctzhi2) */
+
+
+/**********************************
+ * Count leading Zeros (clz)
+ **********************************/
+
+#if defined (L_clzdi2)
+;; count leading zeros
+;; r25:r24 = clz64 (r25:r18)
+;; clobbers: r22, r23, r26
+DEFUN __clzdi2
+    XCALL __clzsi2
+    sbrs r24, 5
+    ret
+    mov_l r22, r18
+    mov_h r23, r19
+    mov_l r24, r20
+    mov_h r25, r21
+    XCALL __clzsi2
+    subi r24, -32
+    ret
+ENDF __clzdi2
+#endif /* defined (L_clzdi2) */
+
+#if defined (L_clzsi2)
+;; count leading zeros
+;; r25:r24 = clz32 (r25:r22)
+;; clobbers: r26
+DEFUN __clzsi2
+    XCALL __clzhi2
+    sbrs r24, 4
+    ret
+    mov_l r24, r22
+    mov_h r25, r23
+    XCALL __clzhi2
+    subi r24, -16
+    ret
+ENDF __clzsi2
+#endif /* defined (L_clzsi2) */
+
+#if defined (L_clzhi2)
+;; count leading zeros
+;; r25:r24 = clz16 (r25:r24)
+;; clobbers: r26
+DEFUN __clzhi2
+    clr  r26
+    tst  r25
+    brne 1f
+    subi r26, -8
+    or   r25, r24
+    brne 1f
+    ldi  r24, 16
+    ret
+1:  cpi  r25, 16
+    brsh 3f
+    subi r26, -3
+    swap r25
+2:  inc  r26
+3:  lsl  r25
+    brcc 2b
+    mov  r24, r26
+    clr  r25
+    ret
+ENDF __clzhi2
+#endif /* defined (L_clzhi2) */
+
+
+/**********************************
+ * Parity 
+ **********************************/
+
+#if defined (L_paritydi2)
+;; r25:r24 = parity64 (r25:r18)
+;; clobbers: __tmp_reg__
+DEFUN __paritydi2
+    eor  r24, r18
+    eor  r24, r19
+    eor  r24, r20
+    eor  r24, r21
+    XJMP __paritysi2
+ENDF __paritydi2
+#endif /* defined (L_paritydi2) */
+
+#if defined (L_paritysi2)
+;; r25:r24 = parity32 (r25:r22)
+;; clobbers: __tmp_reg__
+DEFUN __paritysi2
+    eor  r24, r22
+    eor  r24, r23
+    XJMP __parityhi2
+ENDF __paritysi2
+#endif /* defined (L_paritysi2) */
+
+#if defined (L_parityhi2)
+;; r25:r24 = parity16 (r25:r24)
+;; clobbers: __tmp_reg__
+DEFUN __parityhi2
+    eor  r24, r25
+;; FALLTHRU
+ENDF __parityhi2
+
+;; r25:r24 = parity8 (r24)
+;; clobbers: __tmp_reg__
+DEFUN __parityqi2
+    ;; parity is in r24[0..7]
+    mov  __tmp_reg__, r24
+    swap __tmp_reg__
+    eor  r24, __tmp_reg__
+    ;; parity is in r24[0..3]
+    subi r24, -4
+    andi r24, -5
+    subi r24, -6
+    ;; parity is in r24[0,3]
+    sbrc r24, 3
+    inc  r24
+    ;; parity is in r24[0]
+    andi r24, 1
+    clr  r25
+    ret
+ENDF __parityqi2
+#endif /* defined (L_parityhi2) */
+
+
+/**********************************
+ * Population Count
+ **********************************/
+
+#if defined (L_popcounthi2)
+;; population count
+;; r25:r24 = popcount16 (r25:r24)
+;; clobbers: __tmp_reg__
+DEFUN __popcounthi2
+    XCALL __popcountqi2
+    push r24
+    mov  r24, r25
+    XCALL __popcountqi2
+    clr  r25
+    ;; FALLTHRU
+ENDF __popcounthi2
+
+DEFUN __popcounthi2_tail
+    pop   __tmp_reg__
+    add   r24, __tmp_reg__
+    ret
+ENDF __popcounthi2_tail
+#endif /* defined (L_popcounthi2) */
+
+#if defined (L_popcountsi2)
+;; population count
+;; r25:r24 = popcount32 (r25:r22)
+;; clobbers: __tmp_reg__
+DEFUN __popcountsi2
+    XCALL __popcounthi2
+    push  r24
+    mov_l r24, r22
+    mov_h r25, r23
+    XCALL __popcounthi2
+    XJMP  __popcounthi2_tail
+ENDF __popcountsi2
+#endif /* defined (L_popcountsi2) */
+
+#if defined (L_popcountdi2)
+;; population count
+;; r25:r24 = popcount64 (r25:r18)
+;; clobbers: r22, r23, __tmp_reg__
+DEFUN __popcountdi2
+    XCALL __popcountsi2
+    push  r24
+    mov_l r22, r18
+    mov_h r23, r19
+    mov_l r24, r20
+    mov_h r25, r21
+    XCALL __popcountsi2
+    XJMP  __popcounthi2_tail
+ENDF __popcountdi2
+#endif /* defined (L_popcountdi2) */
+
+#if defined (L_popcountqi2)
+;; population count
+;; r24 = popcount8 (r24)
+;; clobbers: __tmp_reg__
+DEFUN __popcountqi2
+    mov  __tmp_reg__, r24
+    andi r24, 1
+    lsr  __tmp_reg__    
+    lsr  __tmp_reg__    
+    adc  r24, __zero_reg__
+    lsr  __tmp_reg__    
+    adc  r24, __zero_reg__
+    lsr  __tmp_reg__    
+    adc  r24, __zero_reg__
+    lsr  __tmp_reg__    
+    adc  r24, __zero_reg__
+    lsr  __tmp_reg__    
+    adc  r24, __zero_reg__
+    lsr  __tmp_reg__    
+    adc  r24, __tmp_reg__    
+    ret    
+ENDF __popcountqi2
+#endif /* defined (L_popcountqi2) */
+
+
+/**********************************
+ * Swap bytes
+ **********************************/
+
+;; swap two registers with different register number
+.macro bswap a, b
+    eor \a, \b
+    eor \b, \a
+    eor \a, \b
+.endm
+
+#if defined (L_bswapsi2)
+;; swap bytes
+;; r25:r22 = bswap32 (r25:r22)
+DEFUN __bswapsi2
+    bswap r22, r25
+    bswap r23, r24
+    ret
+ENDF __bswapsi2
+#endif /* defined (L_bswapsi2) */
+
+#if defined (L_bswapdi2)
+;; swap bytes
+;; r25:r18 = bswap64 (r25:r18)
+DEFUN __bswapdi2
+    bswap r18, r25
+    bswap r19, r24
+    bswap r20, r23
+    bswap r21, r22
+    ret
+ENDF __bswapdi2
+#endif /* defined (L_bswapdi2) */
+
+
+/**********************************
+ * 64-bit shifts
+ **********************************/
+
+#if defined (L_ashrdi3)
+;; Arithmetic shift right
+;; r25:r18 = ashr64 (r25:r18, r17:r16)
+DEFUN __ashrdi3
+    push r16
+    andi r16, 63
+    breq 2f
+1:  asr  r25
+    ror  r24
+    ror  r23
+    ror  r22
+    ror  r21
+    ror  r20
+    ror  r19
+    ror  r18
+    dec  r16
+    brne 1b
+2:  pop  r16
+    ret
+ENDF __ashrdi3
+#endif /* defined (L_ashrdi3) */
+
+#if defined (L_lshrdi3)
+;; Logic shift right
+;; r25:r18 = lshr64 (r25:r18, r17:r16)
+DEFUN __lshrdi3
+    push r16
+    andi r16, 63
+    breq 2f
+1:  lsr  r25
+    ror  r24
+    ror  r23
+    ror  r22
+    ror  r21
+    ror  r20
+    ror  r19
+    ror  r18
+    dec  r16
+    brne 1b
+2:  pop  r16
+    ret
+ENDF __lshrdi3
+#endif /* defined (L_lshrdi3) */
+
+#if defined (L_ashldi3)
+;; Shift left
+;; r25:r18 = ashl64 (r25:r18, r17:r16)
+DEFUN __ashldi3
+    push r16
+    andi r16, 63
+    breq 2f
+1:  lsl  r18
+    rol  r19
+    rol  r20
+    rol  r21
+    rol  r22
+    rol  r23
+    rol  r24
+    rol  r25
+    dec  r16
+    brne 1b
+2:  pop  r16
+    ret
+ENDF __ashldi3
+#endif /* defined (L_ashldi3) */
+
+
+.section .text.libgcc.fmul, "ax", @progbits
+
+/***********************************************************/    
+;;; Softmul versions of FMUL, FMULS and FMULSU to implement
+;;; __builtin_avr_fmul* if !AVR_HAVE_MUL
+/***********************************************************/    
+
+#define A1 24
+#define B1 25
+#define C0 22
+#define C1 23
+#define A0 __tmp_reg__
+
+#ifdef L_fmuls
+;;; r23:r22 = fmuls (r24, r25) like in FMULS instruction
+;;; Clobbers: r24, r25, __tmp_reg__
+DEFUN __fmuls
+    ;; A0.7 = negate result?
+    mov  A0, A1
+    eor  A0, B1
+    ;; B1 = |B1|
+    sbrc B1, 7
+    neg  B1
+    XJMP __fmulsu_exit
+ENDF __fmuls
+#endif /* L_fmuls */
+
+#ifdef L_fmulsu
+;;; r23:r22 = fmulsu (r24, r25) like in FMULSU instruction
+;;; Clobbers: r24, r25, __tmp_reg__
+DEFUN __fmulsu
+    ;; A0.7 = negate result?
+    mov  A0, A1
+;; FALLTHRU
+ENDF __fmulsu
+
+;; Helper for __fmuls and __fmulsu
+DEFUN __fmulsu_exit
+    ;; A1 = |A1|
+    sbrc A1, 7
+    neg  A1
+#ifdef __AVR_ERRATA_SKIP_JMP_CALL__
+    ;; Some cores have problem skipping 2-word instruction
+    tst  A0
+    brmi 1f
+#else
+    sbrs A0, 7
+#endif /* __AVR_HAVE_JMP_CALL__ */
+    XJMP  __fmul
+1:  XCALL __fmul
+    ;; C = -C iff A0.7 = 1
+    com  C1
+    neg  C0
+    sbci C1, -1
+    ret
+ENDF __fmulsu_exit
+#endif /* L_fmulsu */
+
+
+#ifdef L_fmul
+;;; r22:r23 = fmul (r24, r25) like in FMUL instruction
+;;; Clobbers: r24, r25, __tmp_reg__
+DEFUN __fmul
+    ; clear result
+    clr   C0
+    clr   C1
+    clr   A0
+1:  tst   B1
+    ;; 1.0 = 0x80, so test for bit 7 of B to see if A must to be added to C.
+2:  brpl  3f
+    ;; C += A
+    add   C0, A0
+    adc   C1, A1
+3:  ;; A >>= 1
+    lsr   A1
+    ror   A0
+    ;; B <<= 1
+    lsl   B1
+    brne  2b
+    ret
+ENDF __fmul
+#endif /* L_fmul */
+
+#undef A0
+#undef A1
+#undef B1
+#undef C0
+#undef C1
diff --git a/libgcc/config/avr/t-avr b/libgcc/config/avr/t-avr
index 78829c76af4..f1c114a6dd6 100644
--- a/libgcc/config/avr/t-avr
+++ b/libgcc/config/avr/t-avr
@@ -1,3 +1,51 @@
+LIB1ASMSRC = avr/lib1funcs.S
+LIB1ASMFUNCS = \
+	_mulqi3 \
+	_mulhi3 \
+	_mulhisi3 \
+	_umulhisi3 \
+	_usmulhisi3 \
+	_muluhisi3 \
+	_mulshisi3 \
+	_mulsi3 \
+	_udivmodqi4 \
+	_divmodqi4 \
+	_udivmodhi4 \
+	_divmodhi4 \
+	_udivmodsi4 \
+	_divmodsi4 \
+	_prologue \
+	_epilogue \
+	_exit \
+	_cleanup \
+	_tablejump \
+	_tablejump_elpm \
+	_copy_data \
+	_clear_bss \
+	_ctors \
+	_dtors \
+	_ffssi2 \
+	_ffshi2 \
+	_loop_ffsqi2 \
+	_ctzsi2 \
+	_ctzhi2 \
+	_clzdi2 \
+	_clzsi2 \
+	_clzhi2 \
+	_paritydi2 \
+	_paritysi2 \
+	_parityhi2 \
+	_popcounthi2 \
+	_popcountsi2 \
+	_popcountdi2 \
+	_popcountqi2 \
+	_bswapsi2 \
+	_bswapdi2 \
+	_ashldi3 \
+	_ashrdi3 \
+	_lshrdi3 \
+	_fmul _fmuls _fmulsu
+
 # Extra 16-bit integer functions.
 intfuncs16 = _absvXX2 _addvXX3 _subvXX3 _mulvXX3 _negvXX2 _clrsbXX2
 
diff --git a/libgcc/config/bfin/lib1funcs.S b/libgcc/config/bfin/lib1funcs.S
new file mode 100644
index 00000000000..c7bf4f3f05c
--- /dev/null
+++ b/libgcc/config/bfin/lib1funcs.S
@@ -0,0 +1,211 @@
+/* libgcc functions for Blackfin.
+   Copyright (C) 2005, 2009 Free Software Foundation, Inc.
+   Contributed by Analog Devices.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#ifdef L_divsi3
+.text
+.align 2
+.global ___divsi3;
+.type ___divsi3, STT_FUNC;
+
+___divsi3:
+        [--SP]= RETS;
+	[--SP] = R7;
+
+	R2 = -R0;
+        CC = R0 < 0;
+	IF CC R0 = R2;
+	R7 = CC;
+
+	R2 = -R1;
+        CC = R1 < 0;
+	IF CC R1 = R2;
+	R2 = CC;
+	R7 = R7 ^ R2;
+
+        CALL ___udivsi3;
+
+	CC = R7;
+	R1 = -R0;
+	IF CC R0 = R1;
+
+	R7 = [SP++];
+        RETS = [SP++];
+        RTS;
+#endif
+
+#ifdef L_modsi3	
+.align 2
+.global ___modsi3;
+.type ___modsi3, STT_FUNC;
+
+___modsi3:
+	[--SP] = RETS;
+	[--SP] = R0;
+	[--SP] = R1;
+	CALL ___divsi3;
+	R2 = [SP++];
+	R1 = [SP++];
+	R2 *= R0;
+	R0 = R1 - R2;
+	RETS = [SP++];
+	RTS; 
+#endif
+
+#ifdef L_udivsi3
+.align 2
+.global ___udivsi3;
+.type ___udivsi3, STT_FUNC;
+
+___udivsi3:
+        P0 = 32;
+        LSETUP (0f, 1f) LC0 = P0;
+	/* upper half of dividend */
+        R3 = 0;
+0:
+	/* The first time round in the loop we shift in garbage, but since we
+	   perform 33 shifts, it doesn't matter.  */
+	R0 = ROT R0 BY 1;
+	R3 = ROT R3 BY 1;
+	R2 = R3 - R1;
+        CC = R3 < R1 (IU);
+1:
+	/* Last instruction of the loop.  */
+	IF ! CC R3 = R2;
+
+	/* Shift in the last bit.  */
+	R0 = ROT R0 BY 1;
+	/* R0 is the result, R3 contains the remainder.  */
+	R0 = ~ R0;
+        RTS;
+#endif
+
+#ifdef L_umodsi3
+.align 2
+.global ___umodsi3;
+.type ___umodsi3, STT_FUNC;
+
+___umodsi3:
+	[--SP] = RETS;
+	CALL ___udivsi3;
+	R0 = R3;
+	RETS = [SP++]; 
+	RTS;
+#endif
+
+#ifdef L_umulsi3_highpart
+.align 2
+.global ___umulsi3_highpart;
+.type ___umulsi3_highpart, STT_FUNC;
+
+___umulsi3_highpart:
+	A1 = R1.L * R0.L (FU);
+	A1 = A1 >> 16;
+	A0 = R1.H * R0.H, A1 += R1.L * R0.H (FU);
+	A1 += R0.L * R1.H (FU);
+	A1 = A1 >> 16;
+	A0 += A1;
+	R0 = A0 (FU);
+	RTS;
+#endif
+
+#ifdef L_smulsi3_highpart
+.align 2
+.global ___smulsi3_highpart;
+.type ___smulsi3_highpart, STT_FUNC;
+
+___smulsi3_highpart:
+	A1 = R1.L * R0.L (FU);
+	A1 = A1 >> 16;
+	A0 = R0.H * R1.H, A1 += R0.H * R1.L (IS,M);
+	A1 += R1.H * R0.L (IS,M);
+	A1 = A1 >>> 16;
+	R0 = (A0 += A1);
+	RTS;
+#endif
+
+#ifdef L_muldi3
+.align 2
+.global ___muldi3;
+.type ___muldi3, STT_FUNC;
+
+/*
+	   R1:R0 * R3:R2
+	 = R1.h:R1.l:R0.h:R0.l * R3.h:R3.l:R2.h:R2.l
+[X]	 = (R1.h * R3.h) * 2^96
+[X]	   + (R1.h * R3.l + R1.l * R3.h) * 2^80
+[X]	   + (R1.h * R2.h + R1.l * R3.l + R3.h * R0.h) * 2^64
+[T1]	   + (R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h) * 2^48
+[T2]	   + (R1.l * R2.l + R3.l * R0.l + R0.h * R2.h) * 2^32
+[T3]	   + (R0.l * R2.h + R2.l * R0.h) * 2^16
+[T4]	   + (R0.l * R2.l)
+
+	We can discard the first three lines marked "X" since we produce
+	only a 64 bit result.  So, we need ten 16-bit multiplies.
+
+	Individual mul-acc results:
+[E1]	 =  R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h
+[E2]	 =  R1.l * R2.l + R3.l * R0.l + R0.h * R2.h
+[E3]	 =  R0.l * R2.h + R2.l * R0.h
+[E4]	 =  R0.l * R2.l
+
+	We also need to add high parts from lower-level results to higher ones:
+	E[n]c = E[n] + (E[n+1]c >> 16), where E4c := E4
+
+	One interesting property is that all parts of the result that depend
+	on the sign of the multiplication are discarded.  Those would be the
+	multiplications involving R1.h and R3.h, but only the top 16 bit of
+	the 32 bit result depend on the sign, and since R1.h and R3.h only
+	occur in E1, the top half of these results is cut off.
+	So, we can just use FU mode for all of the 16-bit multiplies, and
+	ignore questions of when to use mixed mode.  */
+
+___muldi3:
+	/* [SP] technically is part of the caller's frame, but we can
+	   use it as scratch space.  */
+	A0 = R2.H * R1.L, A1 = R2.L * R1.H (FU) || R3 = [SP + 12];	/* E1 */
+	A0 += R3.H * R0.L, A1 += R3.L * R0.H (FU) || [SP] = R4;		/* E1 */
+	A0 += A1;							/* E1 */
+	R4 = A0.w;
+	A0 = R0.l * R3.l (FU);						/* E2 */
+	A0 += R2.l * R1.l (FU);						/* E2 */
+
+	A1 = R2.L * R0.L (FU);						/* E4 */
+	R3 = A1.w;
+	A1 = A1 >> 16;							/* E3c */
+	A0 += R2.H * R0.H, A1 += R2.L * R0.H (FU);			/* E2, E3c */
+	A1 += R0.L * R2.H (FU);						/* E3c */
+	R0 = A1.w;
+	A1 = A1 >> 16;							/* E2c */
+	A0 += A1;							/* E2c */
+	R1 = A0.w;
+
+	/* low(result) = low(E3c):low(E4) */
+	R0 = PACK (R0.l, R3.l);
+	/* high(result) = E2c + (E1 << 16) */
+	R1.h = R1.h + R4.l (NS) || R4 = [SP];
+	RTS;
+
+.size ___muldi3, .-___muldi3
+#endif
diff --git a/libgcc/config/bfin/t-bfin b/libgcc/config/bfin/t-bfin
new file mode 100644
index 00000000000..bc2b088ffc1
--- /dev/null
+++ b/libgcc/config/bfin/t-bfin
@@ -0,0 +1,3 @@
+LIB1ASMSRC = bfin/lib1funcs.S
+LIB1ASMFUNCS = _divsi3 _udivsi3 _umodsi3 _modsi3 _muldi3 _umulsi3_highpart
+LIB1ASMFUNCS += _smulsi3_highpart
diff --git a/libgcc/config/c6x/lib1funcs.S b/libgcc/config/c6x/lib1funcs.S
new file mode 100644
index 00000000000..5bf34474bbd
--- /dev/null
+++ b/libgcc/config/c6x/lib1funcs.S
@@ -0,0 +1,438 @@
+/* Copyright 2010, 2011  Free Software Foundation, Inc.
+   Contributed by Bernd Schmidt <bernds@codesourcery.com>.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+	;; ABI considerations for the divide functions
+	;; The following registers are call-used:
+	;; __c6xabi_divi A0,A1,A2,A4,A6,B0,B1,B2,B4,B5
+	;; __c6xabi_divu A0,A1,A2,A4,A6,B0,B1,B2,B4
+	;; __c6xabi_remi A1,A2,A4,A5,A6,B0,B1,B2,B4
+	;; __c6xabi_remu A1,A4,A5,A7,B0,B1,B2,B4
+	;;
+	;; In our implementation, divu and remu are leaf functions,
+	;; while both divi and remi call into divu.
+	;; A0 is not clobbered by any of the functions.
+	;; divu does not clobber B2 either, which is taken advantage of
+	;; in remi.
+	;; divi uses B5 to hold the original return address during
+	;; the call to divu.
+	;; remi uses B2 and A5 to hold the input values during the
+	;; call to divu.  It stores B3 in on the stack.
+
+#ifdef L_divsi3
+.text
+.align 2
+.global __c6xabi_divi
+.hidden __c6xabi_divi
+.type __c6xabi_divi, STT_FUNC
+
+__c6xabi_divi:
+	call .s2	__c6xabi_divu
+||	mv .d2		B3, B5
+||	cmpgt .l1	0, A4, A1
+||	cmpgt .l2	0, B4, B1
+
+	[A1] neg .l1	A4, A4
+||	[B1] neg .l2	B4, B4
+||	xor .s1x	A1, B1, A1
+
+#ifdef _TMS320C6400
+	[A1] addkpc .s2	1f, B3, 4
+#else
+	[A1] mvkl .s2	1f, B3
+	[A1] mvkh .s2	1f, B3
+	nop		2
+#endif
+1:
+	neg .l1		A4, A4
+||	mv .l2		B3,B5
+||	ret .s2		B5
+	nop		5
+#endif
+
+#if defined L_modsi3 || defined L_divmodsi4
+.align 2
+#ifdef L_modsi3
+#define MOD_OUTPUT_REG A4
+.global __c6xabi_remi
+.hidden __c6xabi_remi
+.type __c6xabi_remi, STT_FUNC
+#else
+#define MOD_OUTPUT_REG A5
+.global __c6xabi_divremi
+.hidden __c6xabi_divremi
+.type __c6xabi_divremi, STT_FUNC
+__c6xabi_divremi:
+#endif
+
+__c6xabi_remi:
+	stw .d2t2	B3, *B15--[2]
+||	cmpgt .l1	0, A4, A1
+||	cmpgt .l2	0, B4, B2
+||	mv .s1		A4, A5
+||	call .s2	__c6xabi_divu
+
+	[A1] neg .l1	A4, A4
+||	[B2] neg .l2	B4, B4
+||	xor .s2x	B2, A1, B0
+||	mv .d2		B4, B2
+
+#ifdef _TMS320C6400
+	[B0] addkpc .s2	1f, B3, 1
+	[!B0] addkpc .s2 2f, B3, 1
+	nop		2
+#else
+	[B0] mvkl .s2	1f,B3
+	[!B0] mvkl .s2	2f,B3
+
+	[B0] mvkh .s2	1f,B3
+	[!B0] mvkh .s2	2f,B3
+#endif
+1:
+	neg .l1		A4, A4
+2:
+	ldw .d2t2	*++B15[2], B3
+
+#ifdef _TMS320C6400_PLUS
+	mpy32 .m1x	A4, B2, A6
+	nop		3
+	ret .s2		B3
+	sub .l1		A5, A6, MOD_OUTPUT_REG
+	nop		4
+#else
+	mpyu .m1x	A4, B2, A1
+	nop		1
+	mpylhu .m1x	A4, B2, A6
+||	mpylhu .m2x	B2, A4, B2
+	nop		1
+	add .l1x	A6, B2, A6
+||	ret .s2		B3
+	shl .s1		A6, 16, A6
+	add .d1		A6, A1, A6
+	sub .l1		A5, A6, MOD_OUTPUT_REG
+	nop		2
+#endif
+
+#endif
+
+#if defined L_udivsi3 || defined L_udivmodsi4
+.align 2
+#ifdef L_udivsi3
+.global __c6xabi_divu
+.hidden __c6xabi_divu
+.type __c6xabi_divu, STT_FUNC
+__c6xabi_divu:
+#else
+.global __c6xabi_divremu
+.hidden __c6xabi_divremu
+.type __c6xabi_divremu, STT_FUNC
+__c6xabi_divremu:
+#endif
+	;; We use a series of up to 31 subc instructions.  First, we find
+	;; out how many leading zero bits there are in the divisor.  This
+	;; gives us both a shift count for aligning (shifting) the divisor
+	;; to the, and the number of times we have to execute subc.
+
+	;; At the end, we have both the remainder and most of the quotient
+	;; in A4.  The top bit of the quotient is computed first and is
+	;; placed in A2.
+
+	;; Return immediately if the dividend is zero.  Setting B4 to 1
+	;; is a trick to allow us to leave the following insns in the jump
+	;; delay slot without affecting the result.
+	mv	.s2x	A4, B1
+
+#ifndef _TMS320C6400
+[!b1]	mvk	.s2	1, B4
+#endif
+[b1]	lmbd	.l2	1, B4, B1
+||[!b1] b	.s2	B3	; RETURN A
+#ifdef _TMS320C6400
+||[!b1] mvk	.d2	1, B4
+#endif
+#ifdef L_udivmodsi4
+||[!b1] zero	.s1	A5
+#endif
+	mv	.l1x	B1, A6
+||	shl	.s2	B4, B1, B4
+
+	;; The loop performs a maximum of 28 steps, so we do the
+	;; first 3 here.
+	cmpltu	.l1x	A4, B4, A2
+[!A2]	sub	.l1x	A4, B4, A4
+||	shru	.s2	B4, 1, B4
+||	xor	.s1	1, A2, A2
+
+	shl	.s1	A2, 31, A2
+|| [b1]	subc	.l1x	A4,B4,A4
+|| [b1]	add	.s2	-1, B1, B1
+[b1]	subc	.l1x	A4,B4,A4
+|| [b1]	add	.s2	-1, B1, B1
+
+	;; RETURN A may happen here (note: must happen before the next branch)
+0:
+	cmpgt	.l2	B1, 7, B0
+|| [b1]	subc	.l1x	A4,B4,A4
+|| [b1]	add	.s2	-1, B1, B1
+[b1]	subc	.l1x	A4,B4,A4
+|| [b1]	add	.s2	-1, B1, B1
+|| [b0] b	.s1	0b
+[b1]	subc	.l1x	A4,B4,A4
+|| [b1]	add	.s2	-1, B1, B1
+[b1]	subc	.l1x	A4,B4,A4
+|| [b1]	add	.s2	-1, B1, B1
+[b1]	subc	.l1x	A4,B4,A4
+|| [b1]	add	.s2	-1, B1, B1
+[b1]	subc	.l1x	A4,B4,A4
+|| [b1]	add	.s2	-1, B1, B1
+[b1]	subc	.l1x	A4,B4,A4
+|| [b1]	add	.s2	-1, B1, B1
+	;; loop backwards branch happens here
+
+	ret	.s2	B3
+||	mvk	.s1	32, A1
+	sub	.l1	A1, A6, A6
+#ifdef L_udivmodsi4
+||	extu	.s1	A4, A6, A5
+#endif
+	shl	.s1	A4, A6, A4
+	shru	.s1	A4, 1, A4
+||	sub	.l1	A6, 1, A6
+	or	.l1	A2, A4, A4
+	shru	.s1	A4, A6, A4
+	nop
+
+#endif
+
+#ifdef L_umodsi3
+.align 2
+.global __c6xabi_remu
+.hidden __c6xabi_remu
+.type __c6xabi_remu, STT_FUNC
+__c6xabi_remu:
+	;; The ABI seems designed to prevent these functions calling each other,
+	;; so we duplicate most of the divsi3 code here.
+	mv	.s2x	A4, B1
+#ifndef _TMS320C6400
+[!b1]	mvk	.s2	1, B4
+#endif
+	lmbd	.l2	1, B4, B1
+||[!b1] b	.s2	B3	; RETURN A
+#ifdef _TMS320C6400
+||[!b1] mvk	.d2	1, B4
+#endif
+
+	mv	.l1x	B1, A7
+||	shl	.s2	B4, B1, B4
+
+	cmpltu	.l1x	A4, B4, A1
+[!a1]	sub	.l1x	A4, B4, A4
+	shru	.s2	B4, 1, B4
+
+0:
+	cmpgt	.l2	B1, 7, B0
+|| [b1]	subc	.l1x	A4,B4,A4
+|| [b1]	add	.s2	-1, B1, B1
+	;; RETURN A may happen here (note: must happen before the next branch)
+[b1]	subc	.l1x	A4,B4,A4
+|| [b1]	add	.s2	-1, B1, B1
+|| [b0] b	.s1	0b
+[b1]	subc	.l1x	A4,B4,A4
+|| [b1]	add	.s2	-1, B1, B1
+[b1]	subc	.l1x	A4,B4,A4
+|| [b1]	add	.s2	-1, B1, B1
+[b1]	subc	.l1x	A4,B4,A4
+|| [b1]	add	.s2	-1, B1, B1
+[b1]	subc	.l1x	A4,B4,A4
+|| [b1]	add	.s2	-1, B1, B1
+[b1]	subc	.l1x	A4,B4,A4
+|| [b1]	add	.s2	-1, B1, B1
+	;; loop backwards branch happens here
+
+	ret	.s2	B3
+[b1]	subc	.l1x	A4,B4,A4
+|| [b1]	add	.s2	-1, B1, B1
+[b1]	subc	.l1x	A4,B4,A4
+
+	extu	.s1	A4, A7, A4
+	nop	2
+#endif
+
+#if defined L_strasgi_64plus && defined _TMS320C6400_PLUS
+
+.align 2
+.global __c6xabi_strasgi_64plus
+.hidden __c6xabi_strasgi_64plus
+.type __c6xabi_strasgi_64plus, STT_FUNC
+__c6xabi_strasgi_64plus:
+	shru	.s2x	a6, 2, b31
+||	mv	.s1	a4, a30
+||	mv	.d2	b4, b30
+
+	add	.s2	-4, b31, b31
+
+	sploopd		1
+||	mvc	.s2	b31, ilc
+	ldw	.d2t2	*b30++, b31
+	nop	4
+	mv	.s1x	b31,a31
+	spkernel	6, 0
+||	stw	.d1t1	a31, *a30++
+
+	ret	.s2	b3
+	nop 5
+#endif
+
+#ifdef L_strasgi
+.global __c6xabi_strasgi
+.type __c6xabi_strasgi, STT_FUNC
+__c6xabi_strasgi:
+	;; This is essentially memcpy, with alignment known to be at least
+	;; 4, and the size a multiple of 4 greater than or equal to 28.
+	ldw	.d2t1	*B4++, A0
+||	mvk	.s2	16, B1
+	ldw	.d2t1	*B4++, A1
+||	mvk	.s2	20, B2
+||	sub	.d1	A6, 24, A6
+	ldw	.d2t1	*B4++, A5
+	ldw	.d2t1	*B4++, A7
+||	mv	.l2x	A6, B7
+	ldw	.d2t1	*B4++, A8
+	ldw	.d2t1	*B4++, A9
+||	mv	.s2x	A0, B5
+||	cmpltu	.l2	B2, B7, B0
+
+0:
+	stw	.d1t2	B5, *A4++
+||[b0]	ldw	.d2t1	*B4++, A0
+||	mv	.s2x	A1, B5
+||	mv	.l2	B7, B6
+
+[b0]	sub	.d2	B6, 24, B7
+||[b0]	b	.s2	0b
+||	cmpltu	.l2	B1, B6, B0
+
+[b0]	ldw	.d2t1	*B4++, A1
+||	stw	.d1t2	B5, *A4++
+||	mv	.s2x	A5, B5
+||	cmpltu	.l2	12, B6, B0
+
+[b0]	ldw	.d2t1	*B4++, A5
+||	stw	.d1t2	B5, *A4++
+||	mv	.s2x	A7, B5
+||	cmpltu	.l2	8, B6, B0
+
+[b0]	ldw	.d2t1	*B4++, A7
+||	stw	.d1t2	B5, *A4++
+||	mv	.s2x	A8, B5
+||	cmpltu	.l2	4, B6, B0
+
+[b0]	ldw	.d2t1	*B4++, A8
+||	stw	.d1t2	B5, *A4++
+||	mv	.s2x	A9, B5
+||	cmpltu	.l2	0, B6, B0
+
+[b0]	ldw	.d2t1	*B4++, A9
+||	stw	.d1t2	B5, *A4++
+||	mv	.s2x	A0, B5
+||	cmpltu	.l2	B2, B7, B0
+
+	;; loop back branch happens here
+
+	cmpltu	.l2	B1, B6, B0
+||	ret	.s2	b3
+
+[b0]	stw	.d1t1	A1, *A4++
+||	cmpltu	.l2	12, B6, B0
+[b0]	stw	.d1t1	A5, *A4++
+||	cmpltu	.l2	8, B6, B0
+[b0]	stw	.d1t1	A7, *A4++
+||	cmpltu	.l2	4, B6, B0
+[b0]	stw	.d1t1	A8, *A4++
+||	cmpltu	.l2	0, B6, B0
+[b0]	stw	.d1t1	A9, *A4++
+
+	;; return happens here
+
+#endif
+
+#ifdef _TMS320C6400_PLUS
+#ifdef L_push_rts
+.align 2
+.global __c6xabi_push_rts
+.hidden __c6xabi_push_rts
+.type __c6xabi_push_rts, STT_FUNC
+__c6xabi_push_rts:
+	stw .d2t2	B14, *B15--[2]
+	stdw .d2t1	A15:A14, *B15--
+||	b .s2x		A3
+	stdw .d2t2	B13:B12, *B15--
+	stdw .d2t1	A13:A12, *B15--
+	stdw .d2t2	B11:B10, *B15--
+	stdw .d2t1	A11:A10, *B15--
+	stdw .d2t2	B3:B2, *B15--
+#endif
+
+#ifdef L_pop_rts
+.align 2
+.global __c6xabi_pop_rts
+.hidden __c6xabi_pop_rts
+.type __c6xabi_pop_rts, STT_FUNC
+__c6xabi_pop_rts:
+	lddw .d2t2	*++B15, B3:B2
+	lddw .d2t1	*++B15, A11:A10
+	lddw .d2t2	*++B15, B11:B10
+	lddw .d2t1	*++B15, A13:A12
+	lddw .d2t2	*++B15, B13:B12
+	lddw .d2t1	*++B15, A15:A14
+||	b .s2		B3
+	ldw .d2t2	*++B15[2], B14
+	nop		4
+#endif
+
+#ifdef L_call_stub
+.align 2
+.global __c6xabi_call_stub
+.type __c6xabi_call_stub, STT_FUNC
+__c6xabi_call_stub:
+	stw .d2t1	A2, *B15--[2]
+	stdw .d2t1	A7:A6, *B15--
+||	call .s2	B31
+	stdw .d2t1	A1:A0, *B15--
+	stdw .d2t2	B7:B6, *B15--
+	stdw .d2t2	B5:B4, *B15--
+	stdw .d2t2	B1:B0, *B15--
+	stdw .d2t2	B3:B2, *B15--
+||	addkpc .s2	1f, B3, 0
+1:
+	lddw .d2t2	*++B15, B3:B2
+	lddw .d2t2	*++B15, B1:B0
+	lddw .d2t2	*++B15, B5:B4
+	lddw .d2t2	*++B15, B7:B6
+	lddw .d2t1	*++B15, A1:A0
+	lddw .d2t1	*++B15, A7:A6
+||	b .s2		B3
+	ldw .d2t1	*++B15[2], A2
+	nop		4
+#endif
+
+#endif
+
diff --git a/libgcc/config/c6x/t-elf b/libgcc/config/c6x/t-elf
index 99d0cd2d5ca..e01c4109e52 100644
--- a/libgcc/config/c6x/t-elf
+++ b/libgcc/config/c6x/t-elf
@@ -1,6 +1,11 @@
 # Cannot use default rules due to $(CRTSTUFF_T_CFLAGS).
 CUSTOM_CRTIN = yes
 
+LIB1ASMSRC = c6x/lib1funcs.S
+LIB1ASMFUNCS = _divsi3 _udivsi3 _umodsi3 _modsi3 _udivmodsi4 _divmodsi4
+LIB1ASMFUNCS += _strasgi _strasgi_64plus _clzsi2 _clzdi2 _clz
+LIB1ASMFUNCS += _push_rts _pop_rts _call_stub
+
 # Assemble startup files.
 crti.o: $(srcdir)/config/c6x/crti.S
 	$(crt_compile) -c $(CRTSTUFF_T_CFLAGS) $<
diff --git a/libgcc/config/fr30/lib1funcs.S b/libgcc/config/fr30/lib1funcs.S
new file mode 100644
index 00000000000..7c63453123a
--- /dev/null
+++ b/libgcc/config/fr30/lib1funcs.S
@@ -0,0 +1,115 @@
+/* libgcc routines for the FR30.
+   Copyright (C) 1998, 1999, 2009 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+	.macro FUNC_START name
+	.text
+	.globl __\name
+	.type  __\name, @function
+__\name:
+	.endm
+
+	.macro FUNC_END name
+	.size  __\name, . - __\name
+	.endm
+
+	.macro DIV_BODY reg number
+	.if \number
+	DIV_BODY  \reg, "\number - 1"
+	div1	\reg
+	.endif
+	.endm
+	
+#ifdef L_udivsi3
+FUNC_START udivsi3
+	;; Perform an unsiged division of r4 / r5 and place the result in r4.
+	;; Does not handle overflow yet...
+	mov	r4, mdl
+	div0u	r5
+	DIV_BODY r5 32
+	mov	mdl, r4
+	ret
+FUNC_END udivsi3
+#endif /* L_udivsi3 */
+
+#ifdef L_divsi3
+FUNC_START divsi3
+	;; Perform a siged division of r4 / r5 and place the result in r4.
+	;; Does not handle overflow yet...
+	mov	r4, mdl
+	div0s	r5
+	DIV_BODY r5 32
+	div2    r5
+	div3
+	div4s
+	mov	mdl, r4
+	ret
+FUNC_END divsi3
+#endif /* L_divsi3 */
+
+#ifdef L_umodsi3
+FUNC_START umodsi3
+	;; Perform an unsiged division of r4 / r5 and places the remainder in r4.
+	;; Does not handle overflow yet...
+	mov	r4, mdl
+	div0u	r5
+	DIV_BODY r5 32
+	mov	mdh, r4
+	ret
+FUNC_END umodsi3
+#endif /* L_umodsi3 */
+
+#ifdef L_modsi3
+FUNC_START modsi3
+	;; Perform a siged division of r4 / r5 and place the remainder in r4.
+	;; Does not handle overflow yet...
+	mov	r4, mdl
+	div0s	r5
+	DIV_BODY r5 32
+	div2    r5
+	div3
+	div4s
+	mov	mdh, r4
+	ret
+FUNC_END modsi3
+#endif /* L_modsi3 */
+
+#ifdef L_negsi2
+FUNC_START negsi2
+	ldi:8	#0, r0
+	sub	r4, r0
+	mov	r0, r4
+	ret
+FUNC_END negsi2
+#endif /* L_negsi2 */
+
+#ifdef L_one_cmplsi2
+FUNC_START one_cmplsi2
+	ldi:8	#0xff, r0
+	extsb	r0
+	eor	r0, r4
+	ret
+FUNC_END one_cmplsi2
+#endif /* L_one_cmplsi2 */
+
+
diff --git a/libgcc/config/fr30/t-fr30 b/libgcc/config/fr30/t-fr30
new file mode 100644
index 00000000000..ee5ed9a127e
--- /dev/null
+++ b/libgcc/config/fr30/t-fr30
@@ -0,0 +1,2 @@
+LIB1ASMSRC    = fr30/lib1funcs.S
+LIB1ASMFUNCS  = _udivsi3 _divsi3 _umodsi3 _modsi3
diff --git a/libgcc/config/frv/lib1funcs.S b/libgcc/config/frv/lib1funcs.S
new file mode 100644
index 00000000000..d1ffcab6133
--- /dev/null
+++ b/libgcc/config/frv/lib1funcs.S
@@ -0,0 +1,269 @@
+/* Library functions.
+   Copyright (C) 2000, 2003, 2008, 2009 Free Software Foundation, Inc.
+   Contributed by Red Hat, Inc.
+  
+   This file is part of GCC.
+  
+   GCC is free software ; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+  
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY ; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+  
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <frv-asm.h>
+
+
+#ifdef L_cmpll
+/* icc0 = __cmpll (long long a, long long b)  */
+
+	.globl	EXT(__cmpll)
+	.type	EXT(__cmpll),@function
+	.text
+	.p2align 4
+EXT(__cmpll):
+	cmp	gr8, gr10, icc0
+	ckeq	icc0, cc4
+	P(ccmp)	gr9, gr11, cc4, 1
+	ret
+.Lend:
+	.size	EXT(__cmpll),.Lend-EXT(__cmpll)
+#endif /* L_cmpll */
+
+#ifdef L_cmpf
+/* icc0 = __cmpf (float a, float b) */
+/* Note, because this function returns the result in ICC0, it means it can't
+   handle NaNs.  */
+
+	.globl	EXT(__cmpf)
+	.type	EXT(__cmpf),@function
+	.text
+	.p2align 4
+EXT(__cmpf):
+#ifdef __FRV_HARD_FLOAT__	/* floating point instructions available */
+	movgf	gr8, fr0
+	P(movgf) gr9, fr1
+	setlos	#1, gr8
+	fcmps	fr0, fr1, fcc0
+	P(fcklt) fcc0, cc0
+	fckeq	fcc0, cc1
+	csub	gr0, gr8, gr8, cc0, 1
+	cmov	gr0, gr8, cc1, 1
+	cmpi	gr8, 0, icc0
+	ret
+#else				/* no floating point instructions available */
+	movsg	lr, gr4
+	addi	sp, #-16, sp
+	sti	gr4, @(sp, 8)
+	st	fp, @(sp, gr0)
+	mov	sp, fp
+	call	EXT(__cmpsf2)
+	cmpi	gr8, #0, icc0
+	ldi	@(sp, 8), gr4
+	movgs	gr4, lr
+	ld	@(sp,gr0), fp
+	addi	sp, #16, sp
+	ret
+#endif
+.Lend:
+	.size	EXT(__cmpf),.Lend-EXT(__cmpf)
+#endif
+
+#ifdef L_cmpd
+/* icc0 = __cmpd (double a, double b) */
+/* Note, because this function returns the result in ICC0, it means it can't
+   handle NaNs.  */
+
+	.globl	EXT(__cmpd)
+	.type	EXT(__cmpd),@function
+	.text
+	.p2align 4
+EXT(__cmpd):
+	movsg	lr, gr4
+	addi	sp, #-16, sp
+	sti	gr4, @(sp, 8)
+	st	fp, @(sp, gr0)
+	mov	sp, fp
+	call	EXT(__cmpdf2)
+	cmpi	gr8, #0, icc0
+	ldi	@(sp, 8), gr4
+	movgs	gr4, lr
+	ld	@(sp,gr0), fp
+	addi	sp, #16, sp
+	ret
+.Lend:
+	.size	EXT(__cmpd),.Lend-EXT(__cmpd)
+#endif
+
+#ifdef L_addll
+/* gr8,gr9 = __addll (long long a, long long b) */
+/* Note, gcc will never call this function, but it is present in case an
+   ABI program calls it.  */
+
+	.globl	EXT(__addll)
+	.type	EXT(__addll),@function
+	.text
+	.p2align
+EXT(__addll):
+	addcc	gr9, gr11, gr9, icc0
+	addx	gr8, gr10, gr8, icc0
+	ret
+.Lend:
+	.size	EXT(__addll),.Lend-EXT(__addll)
+#endif
+
+#ifdef L_subll
+/* gr8,gr9 = __subll (long long a, long long b) */
+/* Note, gcc will never call this function, but it is present in case an
+   ABI program calls it.  */
+
+	.globl	EXT(__subll)
+	.type	EXT(__subll),@function
+	.text
+	.p2align 4
+EXT(__subll):
+	subcc	gr9, gr11, gr9, icc0
+	subx	gr8, gr10, gr8, icc0
+	ret
+.Lend:
+	.size	EXT(__subll),.Lend-EXT(__subll)
+#endif
+
+#ifdef L_andll
+/* gr8,gr9 = __andll (long long a, long long b) */
+/* Note, gcc will never call this function, but it is present in case an
+   ABI program calls it.  */
+
+	.globl	EXT(__andll)
+	.type	EXT(__andll),@function
+	.text
+	.p2align 4
+EXT(__andll):
+	P(and)	gr9, gr11, gr9
+	P2(and)	gr8, gr10, gr8
+	ret
+.Lend:
+	.size	EXT(__andll),.Lend-EXT(__andll)
+#endif
+
+#ifdef L_orll
+/* gr8,gr9 = __orll (long long a, long long b) */
+/* Note, gcc will never call this function, but it is present in case an
+   ABI program calls it.  */
+
+	.globl	EXT(__orll)
+	.type	EXT(__orll),@function
+	.text
+	.p2align 4
+EXT(__orll):
+	P(or)	gr9, gr11, gr9
+	P2(or)	gr8, gr10, gr8
+	ret
+.Lend:
+	.size	EXT(__orll),.Lend-EXT(__orll)
+#endif
+
+#ifdef L_xorll
+/* gr8,gr9 = __xorll (long long a, long long b) */
+/* Note, gcc will never call this function, but it is present in case an
+   ABI program calls it.  */
+
+	.globl	EXT(__xorll)
+	.type	EXT(__xorll),@function
+	.text
+	.p2align 4
+EXT(__xorll):
+	P(xor)	gr9, gr11, gr9
+	P2(xor)	gr8, gr10, gr8
+	ret
+.Lend:
+	.size	EXT(__xorll),.Lend-EXT(__xorll)
+#endif
+
+#ifdef L_notll
+/* gr8,gr9 = __notll (long long a) */
+/* Note, gcc will never call this function, but it is present in case an
+   ABI program calls it.  */
+
+	.globl	EXT(__notll)
+	.type	EXT(__notll),@function
+	.text
+	.p2align 4
+EXT(__notll):
+	P(not)	gr9, gr9
+	P2(not)	gr8, gr8
+	ret
+.Lend:
+	.size	EXT(__notll),.Lend-EXT(__notll)
+#endif
+
+#ifdef L_cmov
+/* (void) __cmov (char *dest, const char *src, size_t len) */
+/*
+ * void __cmov (char *dest, const char *src, size_t len)
+ * {
+ *   size_t i;
+ * 
+ *   if (dest < src || dest > src+len)
+ *     {
+ *	 for (i = 0; i < len; i++)
+ *	 dest[i] = src[i];
+ *     }
+ *   else
+ *     {
+ *	 while (len-- > 0)
+ *	 dest[len] = src[len];
+ *     }
+ * }
+ */
+
+	.globl	EXT(__cmov)
+	.type	EXT(__cmov),@function
+	.text
+	.p2align 4
+EXT(__cmov):
+	P(cmp)	gr8, gr9, icc0
+	add	gr9, gr10, gr4
+	P(cmp)	gr8, gr4, icc1
+	bc	icc0, 0, .Lfwd
+	bls	icc1, 0, .Lback
+.Lfwd:
+	/* move bytes in a forward direction */
+	P(setlos) #0, gr5
+	cmp	gr0, gr10, icc0
+	P(subi)	gr9, #1, gr9
+	P2(subi) gr8, #1, gr8
+	bnc	icc0, 0, .Lret
+.Lfloop:
+	/* forward byte move loop */
+	addi	gr5, #1, gr5
+	P(ldsb)	@(gr9, gr5), gr4
+	cmp	gr5, gr10, icc0
+	P(stb)	gr4, @(gr8, gr5)
+	bc	icc0, 0, .Lfloop
+	ret
+.Lbloop:
+	/* backward byte move loop body */
+	ldsb	@(gr9,gr10),gr4
+	stb	gr4,@(gr8,gr10)
+.Lback:
+	P(cmpi)	gr10, #0, icc0
+	addi	gr10, #-1, gr10
+	bne	icc0, 0, .Lbloop
+.Lret:
+	ret
+.Lend:
+	.size	 EXT(__cmov),.Lend-EXT(__cmov)
+#endif
diff --git a/libgcc/config/frv/t-frv b/libgcc/config/frv/t-frv
index b364a5a25b9..9773722d8e7 100644
--- a/libgcc/config/frv/t-frv
+++ b/libgcc/config/frv/t-frv
@@ -1,3 +1,6 @@
+LIB1ASMSRC	= frv/lib1funcs.S
+LIB1ASMFUNCS	= _cmpll _cmpf _cmpd _addll _subll _andll _orll _xorll _notll _cmov
+
 # Compile two additional files that are linked with every program
 # linked using GCC on systems using COFF or ELF, for the sake of C++
 # constructors.
diff --git a/libgcc/config/h8300/lib1funcs.S b/libgcc/config/h8300/lib1funcs.S
new file mode 100644
index 00000000000..1b75b73269d
--- /dev/null
+++ b/libgcc/config/h8300/lib1funcs.S
@@ -0,0 +1,838 @@
+;; libgcc routines for the Renesas H8/300 CPU.
+;; Contributed by Steve Chamberlain <sac@cygnus.com>
+;; Optimizations by Toshiyasu Morita <toshiyasu.morita@renesas.com>
+
+/* Copyright (C) 1994, 2000, 2001, 2002, 2003, 2004, 2009
+   Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/* Assembler register definitions.  */
+
+#define A0 r0
+#define A0L r0l
+#define A0H r0h
+
+#define A1 r1
+#define A1L r1l
+#define A1H r1h
+
+#define A2 r2
+#define A2L r2l
+#define A2H r2h
+
+#define A3 r3
+#define A3L r3l
+#define A3H r3h
+
+#define S0 r4
+#define S0L r4l
+#define S0H r4h
+
+#define S1 r5
+#define S1L r5l
+#define S1H r5h
+
+#define S2 r6
+#define S2L r6l
+#define S2H r6h
+
+#ifdef __H8300__
+#define PUSHP	push
+#define POPP	pop
+
+#define A0P	r0
+#define A1P	r1
+#define A2P	r2
+#define A3P	r3
+#define S0P	r4
+#define S1P	r5
+#define S2P	r6
+#endif
+
+#if defined (__H8300H__) || defined (__H8300S__) || defined (__H8300SX__)
+#define PUSHP	push.l
+#define POPP	pop.l
+
+#define A0P	er0
+#define A1P	er1
+#define A2P	er2
+#define A3P	er3
+#define S0P	er4
+#define S1P	er5
+#define S2P	er6
+
+#define A0E	e0
+#define A1E	e1
+#define A2E	e2
+#define A3E	e3
+#endif
+
+#ifdef __H8300H__
+#ifdef __NORMAL_MODE__
+	.h8300hn
+#else
+	.h8300h
+#endif
+#endif
+
+#ifdef __H8300S__
+#ifdef __NORMAL_MODE__
+	.h8300sn
+#else
+	.h8300s
+#endif
+#endif
+#ifdef __H8300SX__
+#ifdef __NORMAL_MODE__
+	.h8300sxn
+#else
+	.h8300sx
+#endif
+#endif
+
+#ifdef L_cmpsi2
+#ifdef __H8300__
+	.section .text
+	.align 2
+	.global ___cmpsi2
+___cmpsi2:
+	cmp.w	A0,A2
+	bne	.L2
+	cmp.w	A1,A3
+	bne	.L4
+	mov.w	#1,A0
+	rts
+.L2:
+	bgt	.L5
+.L3:
+	mov.w	#2,A0
+	rts
+.L4:
+	bls	.L3
+.L5:
+	sub.w	A0,A0
+	rts
+	.end
+#endif
+#endif /* L_cmpsi2 */
+
+#ifdef L_ucmpsi2
+#ifdef __H8300__
+	.section .text
+	.align 2
+	.global ___ucmpsi2
+___ucmpsi2:
+	cmp.w	A0,A2
+	bne	.L2
+	cmp.w	A1,A3
+	bne	.L4
+	mov.w	#1,A0
+	rts
+.L2:
+	bhi	.L5
+.L3:
+	mov.w	#2,A0
+	rts
+.L4:
+	bls	.L3
+.L5:
+	sub.w	A0,A0
+	rts
+	.end
+#endif
+#endif /* L_ucmpsi2 */
+
+#ifdef L_divhi3
+
+;; HImode divides for the H8/300.
+;; We bunch all of this into one object file since there are several
+;; "supporting routines".
+
+; general purpose normalize routine
+;
+; divisor in A0
+; dividend in A1
+; turns both into +ve numbers, and leaves what the answer sign
+; should be in A2L
+
+#ifdef __H8300__
+	.section .text
+	.align 2
+divnorm:
+	or	A0H,A0H		; is divisor > 0
+	stc	ccr,A2L
+	bge	_lab1
+	not	A0H		; no - then make it +ve
+	not	A0L
+	adds	#1,A0
+_lab1:	or	A1H,A1H	; look at dividend
+	bge	_lab2
+	not	A1H		; it is -ve, make it positive
+	not	A1L
+	adds	#1,A1
+	xor	#0x8,A2L; and toggle sign of result
+_lab2:	rts
+;; Basically the same, except that the sign of the divisor determines
+;; the sign.
+modnorm:
+	or	A0H,A0H		; is divisor > 0
+	stc	ccr,A2L
+	bge	_lab7
+	not	A0H		; no - then make it +ve
+	not	A0L
+	adds	#1,A0
+_lab7:	or	A1H,A1H	; look at dividend
+	bge	_lab8
+	not	A1H		; it is -ve, make it positive
+	not	A1L
+	adds	#1,A1
+_lab8:	rts
+
+; A0=A0/A1 signed
+
+	.global	___divhi3
+___divhi3:
+	bsr	divnorm
+	bsr	___udivhi3
+negans:	btst	#3,A2L	; should answer be negative ?
+	beq	_lab4
+	not	A0H	; yes, so make it so
+	not	A0L
+	adds	#1,A0
+_lab4:	rts
+
+; A0=A0%A1 signed
+
+	.global	___modhi3
+___modhi3:
+	bsr	modnorm
+	bsr	___udivhi3
+	mov	A3,A0
+	bra	negans
+
+; A0=A0%A1 unsigned
+
+	.global	___umodhi3
+___umodhi3:
+	bsr	___udivhi3
+	mov	A3,A0
+	rts
+
+; A0=A0/A1 unsigned
+; A3=A0%A1 unsigned
+; A2H trashed
+; D high 8 bits of denom
+; d low 8 bits of denom
+; N high 8 bits of num
+; n low 8 bits of num
+; M high 8 bits of mod
+; m low 8 bits of mod
+; Q high 8 bits of quot
+; q low 8 bits of quot
+; P preserve
+
+; The H8/300 only has a 16/8 bit divide, so we look at the incoming and
+; see how to partition up the expression.
+
+	.global	___udivhi3
+___udivhi3:
+				; A0 A1 A2 A3
+				; Nn Dd       P
+	sub.w	A3,A3		; Nn Dd xP 00
+	or	A1H,A1H
+	bne	divlongway
+	or	A0H,A0H
+	beq	_lab6
+
+; we know that D == 0 and N is != 0
+	mov.b	A0H,A3L		; Nn Dd xP 0N
+	divxu	A1L,A3		;          MQ
+	mov.b	A3L,A0H	 	; Q
+; dealt with N, do n
+_lab6:	mov.b	A0L,A3L		;           n
+	divxu	A1L,A3		;          mq
+	mov.b	A3L,A0L		; Qq
+	mov.b	A3H,A3L         ;           m
+	mov.b	#0x0,A3H	; Qq       0m
+	rts
+
+; D != 0 - which means the denominator is
+;          loop around to get the result.
+
+divlongway:
+	mov.b	A0H,A3L		; Nn Dd xP 0N
+	mov.b	#0x0,A0H	; high byte of answer has to be zero
+	mov.b	#0x8,A2H	;       8
+div8:	add.b	A0L,A0L		; n*=2
+	rotxl	A3L		; Make remainder bigger
+	rotxl	A3H
+	sub.w	A1,A3		; Q-=N
+	bhs	setbit		; set a bit ?
+	add.w	A1,A3		;  no : too far , Q+=N
+
+	dec	A2H
+	bne	div8		; next bit
+	rts
+
+setbit:	inc	A0L		; do insert bit
+	dec	A2H
+	bne	div8		; next bit
+	rts
+
+#endif /* __H8300__ */
+#endif /* L_divhi3 */
+
+#ifdef L_divsi3
+
+;; 4 byte integer divides for the H8/300.
+;;
+;; We have one routine which does all the work and lots of
+;; little ones which prepare the args and massage the sign.
+;; We bunch all of this into one object file since there are several
+;; "supporting routines".
+
+	.section .text
+	.align 2
+
+; Put abs SIs into r0/r1 and r2/r3, and leave a 1 in r6l with sign of rest.
+; This function is here to keep branch displacements small.
+
+#ifdef __H8300__
+
+divnorm:
+	mov.b	A0H,A0H		; is the numerator -ve
+	stc	ccr,S2L		; keep the sign in bit 3 of S2L
+	bge	postive
+
+	; negate arg
+	not	A0H
+	not	A1H
+	not	A0L
+	not	A1L
+
+	add	#1,A1L
+	addx	#0,A1H
+	addx	#0,A0L
+	addx	#0,A0H
+postive:
+	mov.b	A2H,A2H		; is the denominator -ve
+	bge	postive2
+	not	A2L
+	not	A2H
+	not	A3L
+	not	A3H
+	add.b	#1,A3L
+	addx	#0,A3H
+	addx	#0,A2L
+	addx	#0,A2H
+	xor.b	#0x08,S2L	; toggle the result sign
+postive2:
+	rts
+
+;; Basically the same, except that the sign of the divisor determines
+;; the sign.
+modnorm:
+	mov.b	A0H,A0H		; is the numerator -ve
+	stc	ccr,S2L		; keep the sign in bit 3 of S2L
+	bge	mpostive
+
+	; negate arg
+	not	A0H
+	not	A1H
+	not	A0L
+	not	A1L
+
+	add	#1,A1L
+	addx	#0,A1H
+	addx	#0,A0L
+	addx	#0,A0H
+mpostive:
+	mov.b	A2H,A2H		; is the denominator -ve
+	bge	mpostive2
+	not	A2L
+	not	A2H
+	not	A3L
+	not	A3H
+	add.b	#1,A3L
+	addx	#0,A3H
+	addx	#0,A2L
+	addx	#0,A2H
+mpostive2:
+	rts
+
+#else /* __H8300H__ */
+
+divnorm:
+	mov.l	A0P,A0P		; is the numerator -ve
+	stc	ccr,S2L		; keep the sign in bit 3 of S2L
+	bge	postive
+
+	neg.l	A0P		; negate arg
+
+postive:
+	mov.l	A1P,A1P		; is the denominator -ve
+	bge	postive2
+
+	neg.l	A1P		; negate arg
+	xor.b	#0x08,S2L	; toggle the result sign
+
+postive2:
+	rts
+
+;; Basically the same, except that the sign of the divisor determines
+;; the sign.
+modnorm:
+	mov.l	A0P,A0P		; is the numerator -ve
+	stc	ccr,S2L		; keep the sign in bit 3 of S2L
+	bge	mpostive
+
+	neg.l	A0P		; negate arg
+
+mpostive:
+	mov.l	A1P,A1P		; is the denominator -ve
+	bge	mpostive2
+
+	neg.l	A1P		; negate arg
+
+mpostive2:
+	rts
+
+#endif
+
+; numerator in A0/A1
+; denominator in A2/A3
+	.global	___modsi3
+___modsi3:
+#ifdef __H8300__
+	PUSHP	S2P
+	PUSHP	S0P
+	PUSHP	S1P
+	bsr	modnorm
+	bsr	divmodsi4
+	mov	S0,A0
+	mov	S1,A1
+	bra	exitdiv
+#else
+	PUSHP	S2P
+	bsr	modnorm
+	bsr	___udivsi3
+	mov.l	er3,er0
+	bra	exitdiv
+#endif
+
+	;; H8/300H and H8S version of ___udivsi3 is defined later in
+	;; the file.
+#ifdef __H8300__
+	.global	___udivsi3
+___udivsi3:
+	PUSHP	S2P
+	PUSHP	S0P
+	PUSHP	S1P
+	bsr	divmodsi4
+	bra	reti
+#endif
+
+	.global	___umodsi3
+___umodsi3:
+#ifdef __H8300__
+	PUSHP	S2P
+	PUSHP	S0P
+	PUSHP	S1P
+	bsr	divmodsi4
+	mov	S0,A0
+	mov	S1,A1
+	bra	reti
+#else
+	bsr	___udivsi3
+	mov.l	er3,er0
+	rts
+#endif
+
+	.global	___divsi3
+___divsi3:
+#ifdef __H8300__
+	PUSHP	S2P
+	PUSHP	S0P
+	PUSHP	S1P
+	jsr	divnorm
+	jsr	divmodsi4
+#else
+	PUSHP	S2P
+	jsr	divnorm
+	bsr	___udivsi3
+#endif
+
+	; examine what the sign should be
+exitdiv:
+	btst	#3,S2L
+	beq	reti
+
+	; should be -ve
+#ifdef __H8300__
+	not	A0H
+	not	A1H
+	not	A0L
+	not	A1L
+
+	add	#1,A1L
+	addx	#0,A1H
+	addx	#0,A0L
+	addx	#0,A0H
+#else /* __H8300H__ */
+	neg.l	A0P
+#endif
+
+reti:
+#ifdef __H8300__
+	POPP	S1P
+	POPP	S0P
+#endif
+	POPP	S2P
+	rts
+
+	; takes A0/A1 numerator (A0P for H8/300H)
+	; A2/A3 denominator (A1P for H8/300H)
+	; returns A0/A1 quotient (A0P for H8/300H)
+	; S0/S1 remainder (S0P for H8/300H)
+	; trashes S2H
+
+#ifdef __H8300__
+
+divmodsi4:
+        sub.w	S0,S0		; zero play area
+        mov.w	S0,S1
+        mov.b	A2H,S2H
+        or	A2L,S2H
+        or	A3H,S2H
+        bne	DenHighNonZero
+        mov.b	A0H,A0H
+        bne	NumByte0Zero
+        mov.b	A0L,A0L
+        bne	NumByte1Zero
+        mov.b	A1H,A1H
+        bne	NumByte2Zero
+        bra	NumByte3Zero
+NumByte0Zero:
+	mov.b	A0H,S1L
+        divxu	A3L,S1
+        mov.b	S1L,A0H
+NumByte1Zero:
+	mov.b	A0L,S1L
+        divxu	A3L,S1
+        mov.b	S1L,A0L
+NumByte2Zero:
+	mov.b	A1H,S1L
+        divxu	A3L,S1
+        mov.b	S1L,A1H
+NumByte3Zero:
+	mov.b	A1L,S1L
+        divxu	A3L,S1
+        mov.b	S1L,A1L
+
+        mov.b	S1H,S1L
+        mov.b	#0x0,S1H
+        rts
+
+; have to do the divide by shift and test
+DenHighNonZero:
+	mov.b	A0H,S1L
+        mov.b	A0L,A0H
+        mov.b	A1H,A0L
+        mov.b	A1L,A1H
+
+        mov.b	#0,A1L
+        mov.b	#24,S2H	; only do 24 iterations
+
+nextbit:
+	add.w	A1,A1	; double the answer guess
+        rotxl	A0L
+        rotxl	A0H
+
+        rotxl	S1L	; double remainder
+        rotxl	S1H
+        rotxl	S0L
+        rotxl	S0H
+        sub.w	A3,S1	; does it all fit
+        subx	A2L,S0L
+        subx	A2H,S0H
+        bhs	setone
+
+        add.w	A3,S1	; no, restore mistake
+        addx	A2L,S0L
+        addx	A2H,S0H
+
+        dec	S2H
+        bne	nextbit
+        rts
+
+setone:
+	inc	A1L
+        dec	S2H
+        bne	nextbit
+        rts
+
+#else /* __H8300H__ */
+
+	;; This function also computes the remainder and stores it in er3.
+	.global	___udivsi3
+___udivsi3:
+	mov.w	A1E,A1E		; denominator top word 0?
+	bne	DenHighNonZero
+
+	; do it the easy way, see page 107 in manual
+	mov.w	A0E,A2
+	extu.l	A2P
+	divxu.w	A1,A2P
+	mov.w	A2E,A0E
+	divxu.w	A1,A0P
+	mov.w	A0E,A3
+	mov.w	A2,A0E
+	extu.l	A3P
+	rts
+
+ 	; er0 = er0 / er1
+ 	; er3 = er0 % er1
+ 	; trashes er1 er2
+ 	; expects er1 >= 2^16
+DenHighNonZero:
+	mov.l	er0,er3
+	mov.l	er1,er2
+#ifdef __H8300H__
+divmod_L21:
+	shlr.l	er0
+	shlr.l	er2		; make divisor < 2^16
+	mov.w	e2,e2
+	bne	divmod_L21
+#else
+	shlr.l	#2,er2		; make divisor < 2^16
+	mov.w	e2,e2
+	beq	divmod_L22A
+divmod_L21:
+	shlr.l	#2,er0
+divmod_L22:
+	shlr.l	#2,er2		; make divisor < 2^16
+	mov.w	e2,e2
+	bne	divmod_L21
+divmod_L22A:
+	rotxl.w	r2
+	bcs	divmod_L23
+	shlr.l	er0
+	bra	divmod_L24
+divmod_L23:
+	rotxr.w	r2
+	shlr.l	#2,er0
+divmod_L24:
+#endif
+	;; At this point,
+	;;  er0 contains shifted dividend
+	;;  er1 contains divisor
+	;;  er2 contains shifted divisor
+	;;  er3 contains dividend, later remainder
+	divxu.w	r2,er0		; r0 now contains the approximate quotient (AQ)
+	extu.l	er0
+	beq	divmod_L25
+	subs	#1,er0		; er0 = AQ - 1
+	mov.w	e1,r2
+	mulxu.w	r0,er2		; er2 = upper (AQ - 1) * divisor
+	sub.w	r2,e3		; dividend - 65536 * er2
+	mov.w	r1,r2
+	mulxu.w	r0,er2		; compute er3 = remainder (tentative)
+	sub.l	er2,er3		; er3 = dividend - (AQ - 1) * divisor
+divmod_L25:
+ 	cmp.l	er1,er3		; is divisor < remainder?
+	blo	divmod_L26
+ 	adds	#1,er0
+	sub.l	er1,er3		; correct the remainder
+divmod_L26:
+	rts
+
+#endif
+#endif /* L_divsi3 */
+
+#ifdef L_mulhi3
+
+;; HImode multiply.
+; The H8/300 only has an 8*8->16 multiply.
+; The answer is the same as:
+;
+; product = (srca.l * srcb.l) + ((srca.h * srcb.l) + (srcb.h * srca.l)) * 256
+; (we can ignore A1.h * A0.h cause that will all off the top)
+; A0 in
+; A1 in
+; A0 answer
+
+#ifdef __H8300__
+	.section .text
+	.align 2
+	.global	___mulhi3
+___mulhi3:
+	mov.b	A1L,A2L		; A2l gets srcb.l
+	mulxu	A0L,A2		; A2 gets first sub product
+
+	mov.b	A0H,A3L		; prepare for
+	mulxu	A1L,A3		; second sub product
+
+	add.b	A3L,A2H		; sum first two terms
+
+	mov.b	A1H,A3L		; third sub product
+	mulxu	A0L,A3
+
+	add.b	A3L,A2H		; almost there
+	mov.w	A2,A0		; that is
+	rts
+
+#endif
+#endif /* L_mulhi3 */
+
+#ifdef L_mulsi3
+
+;; SImode multiply.
+;;
+;; I think that shift and add may be sufficient for this.  Using the
+;; supplied 8x8->16 would need 10 ops of 14 cycles each + overhead.  This way
+;; the inner loop uses maybe 20 cycles + overhead, but terminates
+;; quickly on small args.
+;;
+;; A0/A1 src_a
+;; A2/A3 src_b
+;;
+;;  while (a)
+;;    {
+;;      if (a & 1)
+;;        r += b;
+;;      a >>= 1;
+;;      b <<= 1;
+;;    }
+
+	.section .text
+	.align 2
+
+#ifdef __H8300__
+
+	.global	___mulsi3
+___mulsi3:
+	PUSHP	S0P
+	PUSHP	S1P
+
+	sub.w	S0,S0
+	sub.w	S1,S1
+
+	; while (a)
+_top:	mov.w	A0,A0
+	bne	_more
+	mov.w	A1,A1
+	beq	_done
+_more:	; if (a & 1)
+	bld	#0,A1L
+	bcc	_nobit
+	; r += b
+	add.w	A3,S1
+	addx	A2L,S0L
+	addx	A2H,S0H
+_nobit:
+	; a >>= 1
+	shlr	A0H
+	rotxr	A0L
+	rotxr	A1H
+	rotxr	A1L
+
+	; b <<= 1
+	add.w	A3,A3
+	addx	A2L,A2L
+	addx	A2H,A2H
+	bra 	_top
+
+_done:
+	mov.w	S0,A0
+	mov.w	S1,A1
+	POPP	S1P
+	POPP	S0P
+	rts
+
+#else /* __H8300H__ */
+
+;
+; mulsi3 for H8/300H - based on Renesas SH implementation
+;
+; by Toshiyasu Morita
+;
+; Old code:
+;
+; 16b * 16b = 372 states (worst case)
+; 32b * 32b = 724 states (worst case)
+;
+; New code:
+;
+; 16b * 16b =  48 states
+; 16b * 32b =  72 states
+; 32b * 32b =  92 states
+;
+
+	.global	___mulsi3
+___mulsi3:
+	mov.w	r1,r2   ; ( 2 states) b * d
+	mulxu	r0,er2  ; (22 states)
+
+	mov.w	e0,r3   ; ( 2 states) a * d
+	beq	L_skip1 ; ( 4 states)
+	mulxu	r1,er3  ; (22 states)
+	add.w	r3,e2   ; ( 2 states)
+
+L_skip1:
+	mov.w	e1,r3   ; ( 2 states) c * b
+	beq	L_skip2 ; ( 4 states)
+	mulxu	r0,er3  ; (22 states)
+	add.w	r3,e2   ; ( 2 states)
+
+L_skip2:
+	mov.l	er2,er0	; ( 2 states)
+	rts		; (10 states)
+
+#endif
+#endif /* L_mulsi3 */
+#ifdef L_fixunssfsi_asm
+/* For the h8300 we use asm to save some bytes, to
+   allow more programs to fit into the tiny address
+   space.  For the H8/300H and H8S, the C version is good enough.  */
+#ifdef __H8300__
+/* We still treat NANs different than libgcc2.c, but then, the
+   behavior is undefined anyways.  */
+	.global	___fixunssfsi
+___fixunssfsi:
+	cmp.b #0x4f,r0h
+	bge Large_num
+	jmp     @___fixsfsi
+Large_num:
+	bhi L_huge_num
+	xor.b #0x80,A0L
+	bmi L_shift8
+L_huge_num:
+	mov.w #65535,A0
+	mov.w A0,A1
+	rts
+L_shift8:
+	mov.b A0L,A0H
+	mov.b A1H,A0L
+	mov.b A1L,A1H
+	mov.b #0,A1L
+	rts
+#endif
+#endif /* L_fixunssfsi_asm */
diff --git a/libgcc/config/h8300/t-h8300 b/libgcc/config/h8300/t-h8300
new file mode 100644
index 00000000000..4602ff8b9ef
--- /dev/null
+++ b/libgcc/config/h8300/t-h8300
@@ -0,0 +1,3 @@
+LIB1ASMSRC = h8300/lib1funcs.S
+LIB1ASMFUNCS = _cmpsi2 _ucmpsi2 _divhi3 _divsi3 _mulhi3 _mulsi3 \
+  _fixunssfsi_asm
diff --git a/libgcc/config/i386/cygwin.S b/libgcc/config/i386/cygwin.S
new file mode 100644
index 00000000000..8f9c486850e
--- /dev/null
+++ b/libgcc/config/i386/cygwin.S
@@ -0,0 +1,188 @@
+/* stuff needed for libgcc on win32.
+ *
+ *   Copyright (C) 1996, 1998, 2001, 2003, 2008, 2009, 2010
+ *   Free Software Foundation, Inc.
+ *   Written By Steve Chamberlain
+ * 
+ * This file is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 3, or (at your option) any
+ * later version.
+ * 
+ * This file is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * Under Section 7 of GPL version 3, you are granted additional
+ * permissions described in the GCC Runtime Library Exception, version
+ * 3.1, as published by the Free Software Foundation.
+ * 
+ * You should have received a copy of the GNU General Public License and
+ * a copy of the GCC Runtime Library Exception along with this program;
+ * see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#include "auto-host.h"
+
+#ifdef HAVE_GAS_CFI_SECTIONS_DIRECTIVE
+	.cfi_sections	.debug_frame
+# define cfi_startproc()		.cfi_startproc
+# define cfi_endproc()			.cfi_endproc
+# define cfi_adjust_cfa_offset(X) 	.cfi_adjust_cfa_offset X
+# define cfi_def_cfa_register(X)	.cfi_def_cfa_register X
+# define cfi_register(D,S)		.cfi_register D, S
+# ifdef _WIN64
+#  define cfi_push(X)		.cfi_adjust_cfa_offset 8; .cfi_rel_offset X, 0
+#  define cfi_pop(X)		.cfi_adjust_cfa_offset -8; .cfi_restore X
+# else
+#  define cfi_push(X)		.cfi_adjust_cfa_offset 4; .cfi_rel_offset X, 0
+#  define cfi_pop(X)		.cfi_adjust_cfa_offset -4; .cfi_restore X
+# endif
+#else
+# define cfi_startproc()
+# define cfi_endproc()
+# define cfi_adjust_cfa_offset(X)
+# define cfi_def_cfa_register(X)
+# define cfi_register(D,S)
+# define cfi_push(X)
+# define cfi_pop(X)
+#endif /* HAVE_GAS_CFI_SECTIONS_DIRECTIVE */
+
+#ifdef L_chkstk
+/* Function prologue calls __chkstk to probe the stack when allocating more
+   than CHECK_STACK_LIMIT bytes in one go.  Touching the stack at 4K
+   increments is necessary to ensure that the guard pages used
+   by the OS virtual memory manger are allocated in correct sequence.  */
+
+	.global ___chkstk
+	.global	__alloca
+#ifdef _WIN64
+/* __alloca is a normal function call, which uses %rcx as the argument.  */
+	cfi_startproc()
+__alloca:
+	movq	%rcx, %rax
+	/* FALLTHRU */
+
+/* ___chkstk is a *special* function call, which uses %rax as the argument.
+   We avoid clobbering the 4 integer argument registers, %rcx, %rdx, 
+   %r8 and %r9, which leaves us with %rax, %r10, and %r11 to use.  */
+	.align	4
+___chkstk:
+	popq	%r11			/* pop return address */
+	cfi_adjust_cfa_offset(-8)	/* indicate return address in r11 */
+	cfi_register(%rip, %r11)
+	movq	%rsp, %r10
+	cmpq	$0x1000, %rax		/* > 4k ?*/
+	jb	2f
+
+1:	subq	$0x1000, %r10  		/* yes, move pointer down 4k*/
+	orl	$0x0, (%r10)   		/* probe there */
+	subq	$0x1000, %rax  	 	/* decrement count */
+	cmpq	$0x1000, %rax
+	ja	1b			/* and do it again */
+
+2:	subq	%rax, %r10
+	movq	%rsp, %rax		/* hold CFA until return */
+	cfi_def_cfa_register(%rax)
+	orl	$0x0, (%r10)		/* less than 4k, just peek here */
+	movq	%r10, %rsp		/* decrement stack */
+
+	/* Push the return value back.  Doing this instead of just
+	   jumping to %r11 preserves the cached call-return stack
+	   used by most modern processors.  */
+	pushq	%r11
+	ret
+	cfi_endproc()
+#else
+	cfi_startproc()
+___chkstk:
+__alloca:
+	pushl	%ecx			/* save temp */
+	cfi_push(%eax)
+	leal	8(%esp), %ecx		/* point past return addr */
+	cmpl	$0x1000, %eax		/* > 4k ?*/
+	jb	2f
+
+1:	subl	$0x1000, %ecx  		/* yes, move pointer down 4k*/
+	orl	$0x0, (%ecx)   		/* probe there */
+	subl	$0x1000, %eax  	 	/* decrement count */
+	cmpl	$0x1000, %eax
+	ja	1b			/* and do it again */
+
+2:	subl	%eax, %ecx	   
+	orl	$0x0, (%ecx)		/* less than 4k, just peek here */
+	movl	%esp, %eax		/* save current stack pointer */
+	cfi_def_cfa_register(%eax)
+	movl	%ecx, %esp		/* decrement stack */
+	movl	(%eax), %ecx		/* recover saved temp */
+
+	/* Copy the return register.  Doing this instead of just jumping to
+	   the address preserves the cached call-return stack used by most
+	   modern processors.  */
+	pushl	4(%eax)
+	ret
+	cfi_endproc()
+#endif /* _WIN64 */
+#endif /* L_chkstk */
+
+#ifdef L_chkstk_ms
+/* ___chkstk_ms is a *special* function call, which uses %rax as the argument.
+   We avoid clobbering any registers.  Unlike ___chkstk, it just probes the
+   stack and does no stack allocation.  */
+	.global ___chkstk_ms
+#ifdef _WIN64
+	cfi_startproc()
+___chkstk_ms:
+	pushq	%rcx			/* save temps */
+	cfi_push(%rcx)
+	pushq	%rax
+	cfi_push(%rax)
+	cmpq	$0x1000, %rax		/* > 4k ?*/
+	leaq	24(%rsp), %rcx		/* point past return addr */
+	jb	2f
+
+1:	subq	$0x1000, %rcx  		/* yes, move pointer down 4k */
+	orq	$0x0, (%rcx)   		/* probe there */
+	subq	$0x1000, %rax  	 	/* decrement count */
+	cmpq	$0x1000, %rax
+	ja	1b			/* and do it again */
+
+2:	subq	%rax, %rcx
+	orq	$0x0, (%rcx)		/* less than 4k, just peek here */
+
+	popq	%rax
+	cfi_pop(%rax)
+	popq	%rcx
+	cfi_pop(%rcx)
+	ret
+	cfi_endproc()
+#else
+	cfi_startproc()
+___chkstk_ms:
+	pushl	%ecx			/* save temp */
+	cfi_push(%ecx)
+	pushl	%eax
+	cfi_push(%eax)
+	cmpl	$0x1000, %eax		/* > 4k ?*/
+	leal	12(%esp), %ecx		/* point past return addr */
+	jb	2f
+
+1:	subl	$0x1000, %ecx  		/* yes, move pointer down 4k*/
+	orl	$0x0, (%ecx)   		/* probe there */
+	subl	$0x1000, %eax  	 	/* decrement count */
+	cmpl	$0x1000, %eax
+	ja	1b			/* and do it again */
+
+2:	subl	%eax, %ecx
+	orl	$0x0, (%ecx)		/* less than 4k, just peek here */
+
+	popl	%eax
+	cfi_pop(%eax)
+	popl	%ecx
+	cfi_pop(%ecx)
+	ret
+	cfi_endproc()
+#endif /* _WIN64 */
+#endif /* L_chkstk_ms */
diff --git a/libgcc/config/i386/t-chkstk b/libgcc/config/i386/t-chkstk
new file mode 100644
index 00000000000..822981faab8
--- /dev/null
+++ b/libgcc/config/i386/t-chkstk
@@ -0,0 +1,2 @@
+LIB1ASMSRC = i386/cygwin.S
+LIB1ASMFUNCS = _chkstk _chkstk_ms
diff --git a/libgcc/config/ia64/__divxf3.asm b/libgcc/config/ia64/__divxf3.S
index f741bdaf9bc..9cba8f59423 100644
--- a/libgcc/config/ia64/__divxf3.asm
+++ b/libgcc/config/ia64/__divxf3.S
@@ -3,7 +3,7 @@
 #endif
 
 #define L__divxf3
-#include "config/ia64/lib1funcs.asm"
+#include "config/ia64/lib1funcs.S"
 
 #ifdef SHARED
 #undef __divtf3
diff --git a/libgcc/config/ia64/_fixtfdi.asm b/libgcc/config/ia64/_fixtfdi.S
index 4d13c808c51..863b70f7edc 100644
--- a/libgcc/config/ia64/_fixtfdi.asm
+++ b/libgcc/config/ia64/_fixtfdi.S
@@ -3,7 +3,7 @@
 #endif
 
 #define L_fixtfdi
-#include "config/ia64/lib1funcs.asm"
+#include "config/ia64/lib1funcs.S"
 
 #ifdef SHARED
 #undef __fixtfti
diff --git a/libgcc/config/ia64/_fixunstfdi.asm b/libgcc/config/ia64/_fixunstfdi.S
index b722d9e90dc..aac6a284eaa 100644
--- a/libgcc/config/ia64/_fixunstfdi.asm
+++ b/libgcc/config/ia64/_fixunstfdi.S
@@ -3,7 +3,7 @@
 #endif
 
 #define L_fixunstfdi
-#include "config/ia64/lib1funcs.asm"
+#include "config/ia64/lib1funcs.S"
 
 #ifdef SHARED
 #undef __fixunstfti
diff --git a/libgcc/config/ia64/_floatditf.asm b/libgcc/config/ia64/_floatditf.S
index 21d77028176..e37404d26d5 100644
--- a/libgcc/config/ia64/_floatditf.asm
+++ b/libgcc/config/ia64/_floatditf.S
@@ -3,7 +3,7 @@
 #endif
 
 #define L_floatditf
-#include "config/ia64/lib1funcs.asm"
+#include "config/ia64/lib1funcs.S"
 
 #ifdef SHARED
 #undef __floattitf
diff --git a/libgcc/config/ia64/lib1funcs.S b/libgcc/config/ia64/lib1funcs.S
new file mode 100644
index 00000000000..b7eaa6eca3c
--- /dev/null
+++ b/libgcc/config/ia64/lib1funcs.S
@@ -0,0 +1,795 @@
+/* Copyright (C) 2000, 2001, 2003, 2005, 2009 Free Software Foundation, Inc.
+   Contributed by James E. Wilson <wilson@cygnus.com>.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef L__divxf3
+// Compute a 80-bit IEEE double-extended quotient.
+//
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
+//
+// farg0 holds the dividend.  farg1 holds the divisor.
+//
+// __divtf3 is an alternate symbol name for backward compatibility.
+
+	.text
+	.align 16
+	.global __divxf3
+	.proc __divxf3
+__divxf3:
+#ifdef SHARED
+	.global __divtf3
+__divtf3:
+#endif
+	cmp.eq p7, p0 = r0, r0
+	frcpa.s0 f10, p6 = farg0, farg1
+	;;
+(p6)	cmp.ne p7, p0 = r0, r0
+	.pred.rel.mutex p6, p7
+(p6)	fnma.s1 f11 = farg1, f10, f1
+(p6)	fma.s1 f12 = farg0, f10, f0
+	;;
+(p6)	fma.s1 f13 = f11, f11, f0
+(p6)	fma.s1 f14 = f11, f11, f11
+	;;
+(p6)	fma.s1 f11 = f13, f13, f11
+(p6)	fma.s1 f13 = f14, f10, f10
+	;;
+(p6)	fma.s1 f10 = f13, f11, f10
+(p6)	fnma.s1 f11 = farg1, f12, farg0
+	;;
+(p6)	fma.s1 f11 = f11, f10, f12
+(p6)	fnma.s1 f12 = farg1, f10, f1
+	;;
+(p6)	fma.s1 f10 = f12, f10, f10
+(p6)	fnma.s1 f12 = farg1, f11, farg0
+	;;
+(p6)	fma.s0 fret0 = f12, f10, f11
+(p7)	mov fret0 = f10
+	br.ret.sptk rp
+	.endp __divxf3
+#endif
+
+#ifdef L__divdf3
+// Compute a 64-bit IEEE double quotient.
+//
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
+//
+// farg0 holds the dividend.  farg1 holds the divisor.
+
+	.text
+	.align 16
+	.global __divdf3
+	.proc __divdf3
+__divdf3:
+	cmp.eq p7, p0 = r0, r0
+	frcpa.s0 f10, p6 = farg0, farg1
+	;;
+(p6)	cmp.ne p7, p0 = r0, r0
+	.pred.rel.mutex p6, p7
+(p6)	fmpy.s1 f11 = farg0, f10
+(p6)	fnma.s1 f12 = farg1, f10, f1
+	;;
+(p6)	fma.s1 f11 = f12, f11, f11
+(p6)	fmpy.s1 f13 = f12, f12
+	;;
+(p6)	fma.s1 f10 = f12, f10, f10
+(p6)	fma.s1 f11 = f13, f11, f11
+	;;
+(p6)	fmpy.s1 f12 = f13, f13
+(p6)	fma.s1 f10 = f13, f10, f10
+	;;
+(p6)	fma.d.s1 f11 = f12, f11, f11
+(p6)	fma.s1 f10 = f12, f10, f10
+	;;
+(p6)	fnma.d.s1 f8 = farg1, f11, farg0
+	;;
+(p6)	fma.d fret0 = f8, f10, f11
+(p7)	mov fret0 = f10
+	br.ret.sptk rp
+	;;
+	.endp __divdf3
+#endif
+
+#ifdef L__divsf3
+// Compute a 32-bit IEEE float quotient.
+//
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
+//
+// farg0 holds the dividend.  farg1 holds the divisor.
+
+	.text
+	.align 16
+	.global __divsf3
+	.proc __divsf3
+__divsf3:
+	cmp.eq p7, p0 = r0, r0
+	frcpa.s0 f10, p6 = farg0, farg1
+	;;
+(p6)	cmp.ne p7, p0 = r0, r0
+	.pred.rel.mutex p6, p7
+(p6)	fmpy.s1 f8 = farg0, f10
+(p6)	fnma.s1 f9 = farg1, f10, f1
+	;;
+(p6)	fma.s1 f8 = f9, f8, f8
+(p6)	fmpy.s1 f9 = f9, f9
+	;;
+(p6)	fma.s1 f8 = f9, f8, f8
+(p6)	fmpy.s1 f9 = f9, f9
+	;;
+(p6)	fma.d.s1 f10 = f9, f8, f8
+	;;
+(p6)	fnorm.s.s0 fret0 = f10
+(p7)	mov fret0 = f10
+	br.ret.sptk rp
+	;;
+	.endp __divsf3
+#endif
+
+#ifdef L__divdi3
+// Compute a 64-bit integer quotient.
+//
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
+//
+// in0 holds the dividend.  in1 holds the divisor.
+
+	.text
+	.align 16
+	.global __divdi3
+	.proc __divdi3
+__divdi3:
+	.regstk 2,0,0,0
+	// Transfer inputs to FP registers.
+	setf.sig f8 = in0
+	setf.sig f9 = in1
+	// Check divide by zero.
+	cmp.ne.unc p0,p7=0,in1
+	;;
+	// Convert the inputs to FP, so that they won't be treated as unsigned.
+	fcvt.xf f8 = f8
+	fcvt.xf f9 = f9
+(p7)	break 1
+	;;
+	// Compute the reciprocal approximation.
+	frcpa.s1 f10, p6 = f8, f9
+	;;
+	// 3 Newton-Raphson iterations.
+(p6)	fnma.s1 f11 = f9, f10, f1
+(p6)	fmpy.s1 f12 = f8, f10
+	;;
+(p6)	fmpy.s1 f13 = f11, f11
+(p6)	fma.s1 f12 = f11, f12, f12
+	;;
+(p6)	fma.s1 f10 = f11, f10, f10
+(p6)	fma.s1 f11 = f13, f12, f12
+	;;
+(p6)	fma.s1 f10 = f13, f10, f10
+(p6)	fnma.s1 f12 = f9, f11, f8
+	;;
+(p6)	fma.s1 f10 = f12, f10, f11
+	;;
+	// Round quotient to an integer.
+	fcvt.fx.trunc.s1 f10 = f10
+	;;
+	// Transfer result to GP registers.
+	getf.sig ret0 = f10
+	br.ret.sptk rp
+	;;
+	.endp __divdi3
+#endif
+
+#ifdef L__moddi3
+// Compute a 64-bit integer modulus.
+//
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
+//
+// in0 holds the dividend (a).  in1 holds the divisor (b).
+
+	.text
+	.align 16
+	.global __moddi3
+	.proc __moddi3
+__moddi3:
+	.regstk 2,0,0,0
+	// Transfer inputs to FP registers.
+	setf.sig f14 = in0
+	setf.sig f9 = in1
+	// Check divide by zero.
+	cmp.ne.unc p0,p7=0,in1
+	;;
+	// Convert the inputs to FP, so that they won't be treated as unsigned.
+	fcvt.xf f8 = f14
+	fcvt.xf f9 = f9
+(p7)	break 1
+	;;
+	// Compute the reciprocal approximation.
+	frcpa.s1 f10, p6 = f8, f9
+	;;
+	// 3 Newton-Raphson iterations.
+(p6)	fmpy.s1 f12 = f8, f10
+(p6)	fnma.s1 f11 = f9, f10, f1
+	;;
+(p6)	fma.s1 f12 = f11, f12, f12
+(p6)	fmpy.s1 f13 = f11, f11
+	;;
+(p6)	fma.s1 f10 = f11, f10, f10
+(p6)	fma.s1 f11 = f13, f12, f12
+	;;
+	sub in1 = r0, in1
+(p6)	fma.s1 f10 = f13, f10, f10
+(p6)	fnma.s1 f12 = f9, f11, f8
+	;;
+	setf.sig f9 = in1
+(p6)	fma.s1 f10 = f12, f10, f11
+	;;
+	fcvt.fx.trunc.s1 f10 = f10
+	;;
+	// r = q * (-b) + a
+	xma.l f10 = f10, f9, f14
+	;;
+	// Transfer result to GP registers.
+	getf.sig ret0 = f10
+	br.ret.sptk rp
+	;;
+	.endp __moddi3
+#endif
+
+#ifdef L__udivdi3
+// Compute a 64-bit unsigned integer quotient.
+//
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
+//
+// in0 holds the dividend.  in1 holds the divisor.
+
+	.text
+	.align 16
+	.global __udivdi3
+	.proc __udivdi3
+__udivdi3:
+	.regstk 2,0,0,0
+	// Transfer inputs to FP registers.
+	setf.sig f8 = in0
+	setf.sig f9 = in1
+	// Check divide by zero.
+	cmp.ne.unc p0,p7=0,in1
+	;;
+	// Convert the inputs to FP, to avoid FP software-assist faults.
+	fcvt.xuf.s1 f8 = f8
+	fcvt.xuf.s1 f9 = f9
+(p7)	break 1
+	;;
+	// Compute the reciprocal approximation.
+	frcpa.s1 f10, p6 = f8, f9
+	;;
+	// 3 Newton-Raphson iterations.
+(p6)	fnma.s1 f11 = f9, f10, f1
+(p6)	fmpy.s1 f12 = f8, f10
+	;;
+(p6)	fmpy.s1 f13 = f11, f11
+(p6)	fma.s1 f12 = f11, f12, f12
+	;;
+(p6)	fma.s1 f10 = f11, f10, f10
+(p6)	fma.s1 f11 = f13, f12, f12
+	;;
+(p6)	fma.s1 f10 = f13, f10, f10
+(p6)	fnma.s1 f12 = f9, f11, f8
+	;;
+(p6)	fma.s1 f10 = f12, f10, f11
+	;;
+	// Round quotient to an unsigned integer.
+	fcvt.fxu.trunc.s1 f10 = f10
+	;;
+	// Transfer result to GP registers.
+	getf.sig ret0 = f10
+	br.ret.sptk rp
+	;;
+	.endp __udivdi3
+#endif
+
+#ifdef L__umoddi3
+// Compute a 64-bit unsigned integer modulus.
+//
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
+//
+// in0 holds the dividend (a).  in1 holds the divisor (b).
+
+	.text
+	.align 16
+	.global __umoddi3
+	.proc __umoddi3
+__umoddi3:
+	.regstk 2,0,0,0
+	// Transfer inputs to FP registers.
+	setf.sig f14 = in0
+	setf.sig f9 = in1
+	// Check divide by zero.
+	cmp.ne.unc p0,p7=0,in1
+	;;
+	// Convert the inputs to FP, to avoid FP software assist faults.
+	fcvt.xuf.s1 f8 = f14
+	fcvt.xuf.s1 f9 = f9
+(p7)	break 1;
+	;;
+	// Compute the reciprocal approximation.
+	frcpa.s1 f10, p6 = f8, f9
+	;;
+	// 3 Newton-Raphson iterations.
+(p6)	fmpy.s1 f12 = f8, f10
+(p6)	fnma.s1 f11 = f9, f10, f1
+	;;
+(p6)	fma.s1 f12 = f11, f12, f12
+(p6)	fmpy.s1 f13 = f11, f11
+	;;
+(p6)	fma.s1 f10 = f11, f10, f10
+(p6)	fma.s1 f11 = f13, f12, f12
+	;;
+	sub in1 = r0, in1
+(p6)	fma.s1 f10 = f13, f10, f10
+(p6)	fnma.s1 f12 = f9, f11, f8
+	;;
+	setf.sig f9 = in1
+(p6)	fma.s1 f10 = f12, f10, f11
+	;;
+	// Round quotient to an unsigned integer.
+	fcvt.fxu.trunc.s1 f10 = f10
+	;;
+	// r = q * (-b) + a
+	xma.l f10 = f10, f9, f14
+	;;
+	// Transfer result to GP registers.
+	getf.sig ret0 = f10
+	br.ret.sptk rp
+	;;
+	.endp __umoddi3
+#endif
+
+#ifdef L__divsi3
+// Compute a 32-bit integer quotient.
+//
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
+//
+// in0 holds the dividend.  in1 holds the divisor.
+
+	.text
+	.align 16
+	.global __divsi3
+	.proc __divsi3
+__divsi3:
+	.regstk 2,0,0,0
+	// Check divide by zero.
+	cmp.ne.unc p0,p7=0,in1
+	sxt4 in0 = in0
+	sxt4 in1 = in1
+	;;
+	setf.sig f8 = in0
+	setf.sig f9 = in1
+(p7)	break 1
+	;;
+	mov r2 = 0x0ffdd
+	fcvt.xf f8 = f8
+	fcvt.xf f9 = f9
+	;;
+	setf.exp f11 = r2
+	frcpa.s1 f10, p6 = f8, f9
+	;;
+(p6)	fmpy.s1 f8 = f8, f10
+(p6)	fnma.s1 f9 = f9, f10, f1
+	;;
+(p6)	fma.s1 f8 = f9, f8, f8
+(p6)	fma.s1 f9 = f9, f9, f11
+	;;
+(p6)	fma.s1 f10 = f9, f8, f8
+	;;
+	fcvt.fx.trunc.s1 f10 = f10
+	;;
+	getf.sig ret0 = f10
+	br.ret.sptk rp
+	;;
+	.endp __divsi3
+#endif
+
+#ifdef L__modsi3
+// Compute a 32-bit integer modulus.
+//
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
+//
+// in0 holds the dividend.  in1 holds the divisor.
+
+	.text
+	.align 16
+	.global __modsi3
+	.proc __modsi3
+__modsi3:
+	.regstk 2,0,0,0
+	mov r2 = 0x0ffdd
+	sxt4 in0 = in0
+	sxt4 in1 = in1
+	;;
+	setf.sig f13 = r32
+	setf.sig f9 = r33
+	// Check divide by zero.
+	cmp.ne.unc p0,p7=0,in1
+	;;
+	sub in1 = r0, in1
+	fcvt.xf f8 = f13
+	fcvt.xf f9 = f9
+	;;
+	setf.exp f11 = r2
+	frcpa.s1 f10, p6 = f8, f9
+(p7)	break 1
+	;;
+(p6)	fmpy.s1 f12 = f8, f10
+(p6)	fnma.s1 f10 = f9, f10, f1
+	;;
+	setf.sig f9 = in1
+(p6)	fma.s1 f12 = f10, f12, f12
+(p6)	fma.s1 f10 = f10, f10, f11	
+	;;
+(p6)	fma.s1 f10 = f10, f12, f12
+	;;
+	fcvt.fx.trunc.s1 f10 = f10
+	;;
+	xma.l f10 = f10, f9, f13
+	;;
+	getf.sig ret0 = f10
+	br.ret.sptk rp
+	;;
+	.endp __modsi3
+#endif
+
+#ifdef L__udivsi3
+// Compute a 32-bit unsigned integer quotient.
+//
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
+//
+// in0 holds the dividend.  in1 holds the divisor.
+
+	.text
+	.align 16
+	.global __udivsi3
+	.proc __udivsi3
+__udivsi3:
+	.regstk 2,0,0,0
+	mov r2 = 0x0ffdd
+	zxt4 in0 = in0
+	zxt4 in1 = in1
+	;;
+	setf.sig f8 = in0
+	setf.sig f9 = in1
+	// Check divide by zero.
+	cmp.ne.unc p0,p7=0,in1
+	;;
+	fcvt.xf f8 = f8
+	fcvt.xf f9 = f9
+(p7)	break 1
+	;;
+	setf.exp f11 = r2
+	frcpa.s1 f10, p6 = f8, f9
+	;;
+(p6)	fmpy.s1 f8 = f8, f10
+(p6)	fnma.s1 f9 = f9, f10, f1
+	;;
+(p6)	fma.s1 f8 = f9, f8, f8
+(p6)	fma.s1 f9 = f9, f9, f11
+	;;
+(p6)	fma.s1 f10 = f9, f8, f8
+	;;
+	fcvt.fxu.trunc.s1 f10 = f10
+	;;
+	getf.sig ret0 = f10
+	br.ret.sptk rp
+	;;
+	.endp __udivsi3
+#endif
+
+#ifdef L__umodsi3
+// Compute a 32-bit unsigned integer modulus.
+//
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
+//
+// in0 holds the dividend.  in1 holds the divisor.
+
+	.text
+	.align 16
+	.global __umodsi3
+	.proc __umodsi3
+__umodsi3:
+	.regstk 2,0,0,0
+	mov r2 = 0x0ffdd
+	zxt4 in0 = in0
+	zxt4 in1 = in1
+	;;
+	setf.sig f13 = in0
+	setf.sig f9 = in1
+	// Check divide by zero.
+	cmp.ne.unc p0,p7=0,in1
+	;;
+	sub in1 = r0, in1
+	fcvt.xf f8 = f13
+	fcvt.xf f9 = f9
+	;;
+	setf.exp f11 = r2
+	frcpa.s1 f10, p6 = f8, f9
+(p7)	break 1;
+	;;
+(p6)	fmpy.s1 f12 = f8, f10
+(p6)	fnma.s1 f10 = f9, f10, f1
+	;;
+	setf.sig f9 = in1
+(p6)	fma.s1 f12 = f10, f12, f12
+(p6)	fma.s1 f10 = f10, f10, f11
+	;;
+(p6)	fma.s1 f10 = f10, f12, f12
+	;;
+	fcvt.fxu.trunc.s1 f10 = f10
+	;;
+	xma.l f10 = f10, f9, f13
+	;;
+	getf.sig ret0 = f10
+	br.ret.sptk rp
+	;;
+	.endp __umodsi3
+#endif
+
+#ifdef L__save_stack_nonlocal
+// Notes on save/restore stack nonlocal: We read ar.bsp but write
+// ar.bspstore.  This is because ar.bsp can be read at all times
+// (independent of the RSE mode) but since it's read-only we need to
+// restore the value via ar.bspstore.  This is OK because
+// ar.bsp==ar.bspstore after executing "flushrs".
+
+// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
+
+	.text
+	.align 16
+	.global __ia64_save_stack_nonlocal
+	.proc __ia64_save_stack_nonlocal
+__ia64_save_stack_nonlocal:
+	{ .mmf
+	  alloc r18 = ar.pfs, 2, 0, 0, 0
+	  mov r19 = ar.rsc
+	  ;;
+	}
+	{ .mmi
+	  flushrs
+	  st8 [in0] = in1, 24
+	  and r19 = 0x1c, r19
+	  ;;
+	}
+	{ .mmi
+	  st8 [in0] = r18, -16
+	  mov ar.rsc = r19
+	  or r19 = 0x3, r19
+	  ;;
+	}
+	{ .mmi
+	  mov r16 = ar.bsp
+	  mov r17 = ar.rnat
+	  adds r2 = 8, in0
+	  ;;
+	}
+	{ .mmi
+	  st8 [in0] = r16
+	  st8 [r2] = r17
+	}
+	{ .mib
+	  mov ar.rsc = r19
+	  br.ret.sptk.few rp
+	  ;;
+	}
+	.endp __ia64_save_stack_nonlocal
+#endif
+
+#ifdef L__nonlocal_goto
+// void __ia64_nonlocal_goto(void *target_label, void *save_area,
+//			     void *static_chain);
+
+	.text
+	.align 16
+	.global __ia64_nonlocal_goto
+	.proc __ia64_nonlocal_goto
+__ia64_nonlocal_goto:
+	{ .mmi
+	  alloc r20 = ar.pfs, 3, 0, 0, 0
+	  ld8 r12 = [in1], 8
+	  mov.ret.sptk rp = in0, .L0
+	  ;;
+	}
+	{ .mmf
+	  ld8 r16 = [in1], 8
+	  mov r19 = ar.rsc
+	  ;;
+	}
+	{ .mmi
+	  flushrs
+	  ld8 r17 = [in1], 8
+	  and r19 = 0x1c, r19
+	  ;;
+	}
+	{ .mmi
+	  ld8 r18 = [in1]
+	  mov ar.rsc = r19
+	  or r19 = 0x3, r19
+	  ;;
+	}
+	{ .mmi
+	  mov ar.bspstore = r16
+	  ;;
+	  mov ar.rnat = r17
+	  ;;
+	}
+	{ .mmi
+	  loadrs
+	  invala
+	  mov r15 = in2
+	  ;;
+	}
+.L0:	{ .mib
+	  mov ar.rsc = r19
+	  mov ar.pfs = r18
+	  br.ret.sptk.few rp
+	  ;;
+	}
+	.endp __ia64_nonlocal_goto
+#endif
+
+#ifdef L__restore_stack_nonlocal
+// This is mostly the same as nonlocal_goto above.
+// ??? This has not been tested yet.
+
+// void __ia64_restore_stack_nonlocal(void *save_area)
+
+	.text
+	.align 16
+	.global __ia64_restore_stack_nonlocal
+	.proc __ia64_restore_stack_nonlocal
+__ia64_restore_stack_nonlocal:
+	{ .mmf
+	  alloc r20 = ar.pfs, 4, 0, 0, 0
+	  ld8 r12 = [in0], 8
+	  ;;
+	}
+	{ .mmb
+	  ld8 r16=[in0], 8
+	  mov r19 = ar.rsc
+	  ;;
+	}
+	{ .mmi
+	  flushrs
+	  ld8 r17 = [in0], 8
+	  and r19 = 0x1c, r19
+	  ;;
+	}
+	{ .mmf
+	  ld8 r18 = [in0]
+	  mov ar.rsc = r19
+	  ;;
+	}
+	{ .mmi
+	  mov ar.bspstore = r16
+	  ;;
+	  mov ar.rnat = r17
+	  or r19 = 0x3, r19
+	  ;;
+	}
+	{ .mmf
+	  loadrs
+	  invala
+	  ;;
+	}
+.L0:	{ .mib
+	  mov ar.rsc = r19
+	  mov ar.pfs = r18
+	  br.ret.sptk.few rp
+	  ;;
+	}
+	.endp __ia64_restore_stack_nonlocal
+#endif
+
+#ifdef L__trampoline
+// Implement the nested function trampoline.  This is out of line
+// so that we don't have to bother with flushing the icache, as
+// well as making the on-stack trampoline smaller.
+//
+// The trampoline has the following form:
+//
+//		+-------------------+ >
+//	TRAMP:	| __ia64_trampoline | |
+//		+-------------------+  > fake function descriptor
+//		| TRAMP+16          | |
+//		+-------------------+ >
+//		| target descriptor |
+//		+-------------------+
+//		| static link	    |
+//		+-------------------+
+
+	.text
+	.align 16
+	.global __ia64_trampoline
+	.proc __ia64_trampoline
+__ia64_trampoline:
+	{ .mmi
+	  ld8 r2 = [r1], 8
+	  ;;
+	  ld8 r15 = [r1]
+	}
+	{ .mmi
+	  ld8 r3 = [r2], 8
+	  ;;
+	  ld8 r1 = [r2]
+	  mov b6 = r3
+	}
+	{ .bbb
+	  br.sptk.many b6
+	  ;;
+	}
+	.endp __ia64_trampoline
+#endif
+
+#ifdef SHARED
+// Thunks for backward compatibility.
+#ifdef L_fixtfdi
+	.text
+	.align 16
+	.global __fixtfti
+	.proc __fixtfti
+__fixtfti:
+	{ .bbb
+	  br.sptk.many __fixxfti
+	  ;;
+	}
+	.endp __fixtfti
+#endif
+#ifdef L_fixunstfdi
+	.align 16
+	.global __fixunstfti
+	.proc __fixunstfti
+__fixunstfti:
+	{ .bbb
+	  br.sptk.many __fixunsxfti
+	  ;;
+	}
+	.endp __fixunstfti
+#endif
+#ifdef L_floatditf
+	.align 16
+	.global __floattitf
+	.proc __floattitf
+__floattitf:
+	{ .bbb
+	  br.sptk.many __floattixf
+	  ;;
+	}
+	.endp __floattitf
+#endif
+#endif
diff --git a/libgcc/config/ia64/t-hpux b/libgcc/config/ia64/t-hpux
index ef3387e7a61..1fee41385c0 100644
--- a/libgcc/config/ia64/t-hpux
+++ b/libgcc/config/ia64/t-hpux
@@ -1 +1,6 @@
+# On HP-UX we do not want _fixtfdi, _fixunstfdi, or _floatditf from
+# LIB1ASMSRC.  These functions map the 128 bit conversion function names
+# to 80 bit conversions and were done for Linux backwards compatibility.
+LIB1ASMFUNCS := $(filter-out _fixtfdi _fixunstfdi _floatditf,$(LIB1ASMFUNCS))
+
 LIB2ADDEH = $(srcdir)/unwind-c.c
diff --git a/libgcc/config/ia64/t-ia64 b/libgcc/config/ia64/t-ia64
index 59cf3aa75f4..80445d8a2a8 100644
--- a/libgcc/config/ia64/t-ia64
+++ b/libgcc/config/ia64/t-ia64
@@ -1,3 +1,16 @@
+LIB1ASMSRC    = ia64/lib1funcs.S
+
+# We use different names for the DImode div/mod files so that they won't
+# conflict with libgcc2.c files.  We used to use __ia64 as a prefix, now
+# we use __ as the prefix.  Note that L_divdi3 in libgcc2.c actually defines
+# a TImode divide function, so there is no actual overlap here between
+# libgcc2.c and lib1funcs.S.
+LIB1ASMFUNCS  = __divxf3 __divdf3 __divsf3 \
+	__divdi3 __moddi3 __udivdi3 __umoddi3 \
+	__divsi3 __modsi3 __udivsi3 __umodsi3 __save_stack_nonlocal \
+	__nonlocal_goto __restore_stack_nonlocal __trampoline \
+	_fixtfdi _fixunstfdi _floatditf
+
 CUSTOM_CRTSTUFF = yes
 
 # Assemble startup files.
diff --git a/libgcc/config/ia64/t-softfp-compat b/libgcc/config/ia64/t-softfp-compat
index d3dad68c48f..00f45d51cd0 100644
--- a/libgcc/config/ia64/t-softfp-compat
+++ b/libgcc/config/ia64/t-softfp-compat
@@ -3,5 +3,5 @@
 # Replace __dvxf3 _fixtfdi _fixunstfdi _floatditf
 libgcc1-tf-functions = __divxf3  _fixtfdi _fixunstfdi _floatditf
 LIB1ASMFUNCS := $(filter-out $(libgcc1-tf-functions), $(LIB1ASMFUNCS))
-libgcc1-tf-compats = $(addsuffix .asm, $(libgcc1-tf-functions))
+libgcc1-tf-compats = $(addsuffix .S, $(libgcc1-tf-functions))
 LIB2ADD += $(addprefix $(srcdir)/config/ia64/, $(libgcc1-tf-compats))
diff --git a/libgcc/config/m32c/lib1funcs.S b/libgcc/config/m32c/lib1funcs.S
new file mode 100644
index 00000000000..9b657787187
--- /dev/null
+++ b/libgcc/config/m32c/lib1funcs.S
@@ -0,0 +1,231 @@
+/* libgcc routines for R8C/M16C/M32C
+   Copyright (C) 2005, 2009, 2010
+   Free Software Foundation, Inc.
+   Contributed by Red Hat.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if defined(__r8c_cpu__) || defined(__m16c_cpu__)
+#define A16
+#define A(n,w) n
+#define W w
+#else
+#define A24
+#define A(n,w) w
+#define W l
+#endif
+
+
+#ifdef L__m32c_memregs
+
+/* Warning: these memory locations are used as a register bank.  They
+   *must* end up consecutive in any final executable, so you may *not*
+   use the otherwise obvious ".comm" directive to allocate space for
+   them. */
+
+	.bss
+	.global	mem0
+mem0:	.space	1
+	.global	mem1
+mem1:	.space	1
+	.global	mem2
+mem2:	.space	1
+	.global	mem3
+mem3:	.space	1
+	.global	mem4
+mem4:	.space	1
+	.global	mem5
+mem5:	.space	1
+	.global	mem6
+mem6:	.space	1
+	.global	mem7
+mem7:	.space	1
+	.global	mem8
+mem8:	.space	1
+	.global	mem9
+mem9:	.space	1
+	.global	mem10
+mem10:	.space	1
+	.global	mem11
+mem11:	.space	1
+	.global	mem12
+mem12:	.space	1
+	.global	mem13
+mem13:	.space	1
+	.global	mem14
+mem14:	.space	1
+	.global	mem15
+mem15:	.space	1
+
+#endif
+
+#ifdef L__m32c_eh_return
+	.text
+	.global __m32c_eh_return
+__m32c_eh_return:	
+
+	/* At this point, r0 has the stack adjustment, r1r3 has the
+	   address to return to.  The stack looks like this:
+
+	   old_ra
+	   old_fp
+	   <- unwound sp
+	   ...
+	   fb
+	   through
+	   r0
+	   <- sp
+
+	   What we need to do is restore all the registers, update the
+	   stack, and return to the right place.
+	*/
+
+	stc	sp,a0
+	
+	add.W	A(#16,#24),a0
+	/* a0 points to the current stack, just above the register
+	   save areas */
+
+	mov.w	a0,a1
+	exts.w	r0
+	sub.W	A(r0,r2r0),a1
+	sub.W	A(#3,#4),a1
+	/* a1 points to the new stack.  */
+
+	/* This is for the "rts" below.  */
+	mov.w	r1,[a1]
+#ifdef A16
+	mov.w	r2,r1
+	mov.b	r1l,2[a1]
+#else
+	mov.w	r2,2[a1]
+#endif
+
+	/* This is for the "popc sp" below.  */
+	mov.W	a1,[a0]	
+
+	popm    r0,r1,r2,r3,a0,a1,sb,fb
+	popc	sp
+	rts
+#endif
+
+/* SImode arguments for SI foo(SI,SI) functions.  */
+#ifdef A16
+#define SAL  5[fb]
+#define SAH  7[fb]
+#define SBL  9[fb]
+#define SBH 11[fb]
+#else
+#define SAL  8[fb]
+#define SAH 10[fb]
+#define SBL 12[fb]
+#define SBH 14[fb]
+#endif
+
+#ifdef L__m32c_mulsi3
+	.text
+	.global ___mulsi3
+___mulsi3:
+	enter	#0
+	push.w	r2
+	mov.w	SAL,r0
+	mulu.w	SBL,r0		/* writes to r2r0 */
+	mov.w	r0,mem0
+	mov.w	r2,mem2
+	mov.w	SAL,r0
+	mulu.w	SBH,r0		/* writes to r2r0 */
+	add.w	r0,mem2
+	mov.w	SAH,r0
+	mulu.w	SBL,r0		/* writes to r2r0 */
+	add.w	r0,mem2
+	pop.w	r2
+	exitd
+#endif
+
+#ifdef L__m32c_cmpsi2
+	.text
+	.global ___cmpsi2
+___cmpsi2:
+	enter	#0
+	cmp.w	SBH,SAH
+	jgt	cmpsi_gt
+	jlt	cmpsi_lt
+	cmp.w	SBL,SAL
+	jgt	cmpsi_gt
+	jlt	cmpsi_lt
+	mov.w	#1,r0
+	exitd
+cmpsi_gt:
+	mov.w	#2,r0
+	exitd
+cmpsi_lt:
+	mov.w	#0,r0
+	exitd
+#endif
+
+#ifdef L__m32c_ucmpsi2
+	.text
+	.global ___ucmpsi2
+___ucmpsi2:
+	enter	#0
+	cmp.w	SBH,SAH
+	jgtu	cmpsi_gt
+	jltu	cmpsi_lt
+	cmp.w	SBL,SAL
+	jgtu	cmpsi_gt
+	jltu	cmpsi_lt
+	mov.w	#1,r0
+	exitd
+cmpsi_gt:
+	mov.w	#2,r0
+	exitd
+cmpsi_lt:
+	mov.w	#0,r0
+	exitd
+#endif
+
+#ifdef L__m32c_jsri16
+	.text
+#ifdef A16
+	.global	m32c_jsri16
+m32c_jsri16:
+	add.w	#-1, sp
+
+	/* Read the address (16 bits) and return address (24 bits) off
+	the stack.  */
+	mov.w	4[sp], r0
+	mov.w	1[sp], r3
+	mov.b	3[sp], a0 /* This zero-extends, so the high byte has
+			     zero in it.  */
+
+	/* Write the return address, then new address, to the stack.  */
+	mov.w	a0, 1[sp] /* Just to get the zero in 2[sp].  */
+	mov.w	r0, 0[sp]
+	mov.w	r3, 3[sp]
+	mov.b	a0, 5[sp]
+
+	/* This "returns" to the target address, leaving the pending
+	return address on the stack.  */
+	rts
+#endif
+
+#endif
diff --git a/libgcc/config/m32c/t-m32c b/libgcc/config/m32c/t-m32c
new file mode 100644
index 00000000000..d21483750fd
--- /dev/null
+++ b/libgcc/config/m32c/t-m32c
@@ -0,0 +1,9 @@
+LIB1ASMSRC = m32c/lib1funcs.S
+
+LIB1ASMFUNCS = \
+	__m32c_memregs \
+	__m32c_eh_return \
+	__m32c_mulsi3 \
+	__m32c_cmpsi2 \
+	__m32c_ucmpsi2 \
+	__m32c_jsri16
diff --git a/libgcc/config/m32r/initfini.c b/libgcc/config/m32r/initfini.c
index 6e7d58614c7..56332459223 100644
--- a/libgcc/config/m32r/initfini.c
+++ b/libgcc/config/m32r/initfini.c
@@ -1,5 +1,5 @@
 /* .init/.fini section handling + C++ global constructor/destructor handling.
-   This file is based on crtstuff.c, sol2-crti.asm, sol2-crtn.asm.
+   This file is based on crtstuff.c, sol2-crti.S, sol2-crtn.S.
 
    Copyright (C) 1996, 1997, 1998, 2006, 2009 Free Software Foundation, Inc.
 
diff --git a/libgcc/config/m68k/lb1sf68.S b/libgcc/config/m68k/lb1sf68.S
new file mode 100644
index 00000000000..0339a092c4f
--- /dev/null
+++ b/libgcc/config/m68k/lb1sf68.S
@@ -0,0 +1,4116 @@
+/* libgcc routines for 68000 w/o floating-point hardware.
+   Copyright (C) 1994, 1996, 1997, 1998, 2008, 2009 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/* Use this one for any 680x0; assumes no floating point hardware.
+   The trailing " '" appearing on some lines is for ANSI preprocessors.  Yuk.
+   Some of this code comes from MINIX, via the folks at ericsson.
+   D. V. Henkel-Wallace (gumby@cygnus.com) Fete Bastille, 1992
+*/
+
+/* These are predefined by new versions of GNU cpp.  */
+
+#ifndef __USER_LABEL_PREFIX__
+#define __USER_LABEL_PREFIX__ _
+#endif
+
+#ifndef __REGISTER_PREFIX__
+#define __REGISTER_PREFIX__
+#endif
+
+#ifndef __IMMEDIATE_PREFIX__
+#define __IMMEDIATE_PREFIX__ #
+#endif
+
+/* ANSI concatenation macros.  */
+
+#define CONCAT1(a, b) CONCAT2(a, b)
+#define CONCAT2(a, b) a ## b
+
+/* Use the right prefix for global labels.  */
+
+#define SYM(x) CONCAT1 (__USER_LABEL_PREFIX__, x)
+
+/* Note that X is a function.  */
+	
+#ifdef __ELF__
+#define FUNC(x) .type SYM(x),function
+#else
+/* The .proc pseudo-op is accepted, but ignored, by GAS.  We could just	
+   define this to the empty string for non-ELF systems, but defining it
+   to .proc means that the information is available to the assembler if
+   the need arises.  */
+#define FUNC(x) .proc
+#endif
+		
+/* Use the right prefix for registers.  */
+
+#define REG(x) CONCAT1 (__REGISTER_PREFIX__, x)
+
+/* Use the right prefix for immediate values.  */
+
+#define IMM(x) CONCAT1 (__IMMEDIATE_PREFIX__, x)
+
+#define d0 REG (d0)
+#define d1 REG (d1)
+#define d2 REG (d2)
+#define d3 REG (d3)
+#define d4 REG (d4)
+#define d5 REG (d5)
+#define d6 REG (d6)
+#define d7 REG (d7)
+#define a0 REG (a0)
+#define a1 REG (a1)
+#define a2 REG (a2)
+#define a3 REG (a3)
+#define a4 REG (a4)
+#define a5 REG (a5)
+#define a6 REG (a6)
+#define fp REG (fp)
+#define sp REG (sp)
+#define pc REG (pc)
+
+/* Provide a few macros to allow for PIC code support.
+ * With PIC, data is stored A5 relative so we've got to take a bit of special
+ * care to ensure that all loads of global data is via A5.  PIC also requires
+ * jumps and subroutine calls to be PC relative rather than absolute.  We cheat
+ * a little on this and in the PIC case, we use short offset branches and
+ * hope that the final object code is within range (which it should be).
+ */
+#ifndef __PIC__
+
+	/* Non PIC (absolute/relocatable) versions */
+
+	.macro PICCALL addr
+	jbsr	\addr
+	.endm
+
+	.macro PICJUMP addr
+	jmp	\addr
+	.endm
+
+	.macro PICLEA sym, reg
+	lea	\sym, \reg
+	.endm
+
+	.macro PICPEA sym, areg
+	pea	\sym
+	.endm
+
+#else /* __PIC__ */
+
+# if defined (__uClinux__)
+
+	/* Versions for uClinux */
+
+#  if defined(__ID_SHARED_LIBRARY__)
+
+	/* -mid-shared-library versions  */
+
+	.macro PICLEA sym, reg
+	movel	a5@(_current_shared_library_a5_offset_), \reg
+	movel	\sym@GOT(\reg), \reg
+	.endm
+
+	.macro PICPEA sym, areg
+	movel	a5@(_current_shared_library_a5_offset_), \areg
+	movel	\sym@GOT(\areg), sp@-
+	.endm
+
+	.macro PICCALL addr
+	PICLEA	\addr,a0
+	jsr	a0@
+	.endm
+
+	.macro PICJUMP addr
+	PICLEA	\addr,a0
+	jmp	a0@
+	.endm
+
+#  else /* !__ID_SHARED_LIBRARY__ */
+
+	/* Versions for -msep-data */
+
+	.macro PICLEA sym, reg
+	movel	\sym@GOT(a5), \reg
+	.endm
+
+	.macro PICPEA sym, areg
+	movel	\sym@GOT(a5), sp@-
+	.endm
+
+	.macro PICCALL addr
+#if defined (__mcoldfire__) && !defined (__mcfisab__) && !defined (__mcfisac__)
+	lea	\addr-.-8,a0
+	jsr	pc@(a0)
+#else
+	jbsr	\addr
+#endif
+	.endm
+
+	.macro PICJUMP addr
+	/* ISA C has no bra.l instruction, and since this assembly file
+	   gets assembled into multiple object files, we avoid the
+	   bra instruction entirely.  */
+#if defined (__mcoldfire__) && !defined (__mcfisab__)
+	lea	\addr-.-8,a0
+	jmp	pc@(a0)
+#else
+	bra	\addr
+#endif
+	.endm
+
+#  endif
+
+# else /* !__uClinux__ */
+
+	/* Versions for Linux */
+
+	.macro PICLEA sym, reg
+	movel	#_GLOBAL_OFFSET_TABLE_@GOTPC, \reg
+	lea	(-6, pc, \reg), \reg
+	movel	\sym@GOT(\reg), \reg
+	.endm
+
+	.macro PICPEA sym, areg
+	movel	#_GLOBAL_OFFSET_TABLE_@GOTPC, \areg
+	lea	(-6, pc, \areg), \areg
+	movel	\sym@GOT(\areg), sp@-
+	.endm
+
+	.macro PICCALL addr
+#if defined (__mcoldfire__) && !defined (__mcfisab__) && !defined (__mcfisac__)
+	lea	\addr-.-8,a0
+	jsr	pc@(a0)
+#else
+	jbsr	\addr
+#endif
+	.endm
+
+	.macro PICJUMP addr
+	/* ISA C has no bra.l instruction, and since this assembly file
+	   gets assembled into multiple object files, we avoid the
+	   bra instruction entirely.  */
+#if defined (__mcoldfire__) && !defined (__mcfisab__)
+	lea	\addr-.-8,a0
+	jmp	pc@(a0)
+#else
+	bra	\addr
+#endif
+	.endm
+
+# endif
+#endif /* __PIC__ */
+
+
+#ifdef L_floatex
+
+| This is an attempt at a decent floating point (single, double and 
+| extended double) code for the GNU C compiler. It should be easy to
+| adapt to other compilers (but beware of the local labels!).
+
+| Starting date: 21 October, 1990
+
+| It is convenient to introduce the notation (s,e,f) for a floating point
+| number, where s=sign, e=exponent, f=fraction. We will call a floating
+| point number fpn to abbreviate, independently of the precision.
+| Let MAX_EXP be in each case the maximum exponent (255 for floats, 1023 
+| for doubles and 16383 for long doubles). We then have the following 
+| different cases:
+|  1. Normalized fpns have 0 < e < MAX_EXP. They correspond to 
+|     (-1)^s x 1.f x 2^(e-bias-1).
+|  2. Denormalized fpns have e=0. They correspond to numbers of the form
+|     (-1)^s x 0.f x 2^(-bias).
+|  3. +/-INFINITY have e=MAX_EXP, f=0.
+|  4. Quiet NaN (Not a Number) have all bits set.
+|  5. Signaling NaN (Not a Number) have s=0, e=MAX_EXP, f=1.
+
+|=============================================================================
+|                                  exceptions
+|=============================================================================
+
+| This is the floating point condition code register (_fpCCR):
+|
+| struct {
+|   short _exception_bits;	
+|   short _trap_enable_bits;	
+|   short _sticky_bits;
+|   short _rounding_mode;
+|   short _format;
+|   short _last_operation;
+|   union {
+|     float sf;
+|     double df;
+|   } _operand1;
+|   union {
+|     float sf;
+|     double df;
+|   } _operand2;
+| } _fpCCR;
+
+	.data
+	.even
+
+	.globl	SYM (_fpCCR)
+	
+SYM (_fpCCR):
+__exception_bits:
+	.word	0
+__trap_enable_bits:
+	.word	0
+__sticky_bits:
+	.word	0
+__rounding_mode:
+	.word	ROUND_TO_NEAREST
+__format:
+	.word	NIL
+__last_operation:
+	.word	NOOP
+__operand1:
+	.long	0
+	.long	0
+__operand2:
+	.long 	0
+	.long	0
+
+| Offsets:
+EBITS  = __exception_bits - SYM (_fpCCR)
+TRAPE  = __trap_enable_bits - SYM (_fpCCR)
+STICK  = __sticky_bits - SYM (_fpCCR)
+ROUND  = __rounding_mode - SYM (_fpCCR)
+FORMT  = __format - SYM (_fpCCR)
+LASTO  = __last_operation - SYM (_fpCCR)
+OPER1  = __operand1 - SYM (_fpCCR)
+OPER2  = __operand2 - SYM (_fpCCR)
+
+| The following exception types are supported:
+INEXACT_RESULT 		= 0x0001
+UNDERFLOW 		= 0x0002
+OVERFLOW 		= 0x0004
+DIVIDE_BY_ZERO 		= 0x0008
+INVALID_OPERATION 	= 0x0010
+
+| The allowed rounding modes are:
+UNKNOWN           = -1
+ROUND_TO_NEAREST  = 0 | round result to nearest representable value
+ROUND_TO_ZERO     = 1 | round result towards zero
+ROUND_TO_PLUS     = 2 | round result towards plus infinity
+ROUND_TO_MINUS    = 3 | round result towards minus infinity
+
+| The allowed values of format are:
+NIL          = 0
+SINGLE_FLOAT = 1
+DOUBLE_FLOAT = 2
+LONG_FLOAT   = 3
+
+| The allowed values for the last operation are:
+NOOP         = 0
+ADD          = 1
+MULTIPLY     = 2
+DIVIDE       = 3
+NEGATE       = 4
+COMPARE      = 5
+EXTENDSFDF   = 6
+TRUNCDFSF    = 7
+
+|=============================================================================
+|                           __clear_sticky_bits
+|=============================================================================
+
+| The sticky bits are normally not cleared (thus the name), whereas the 
+| exception type and exception value reflect the last computation. 
+| This routine is provided to clear them (you can also write to _fpCCR,
+| since it is globally visible).
+
+	.globl  SYM (__clear_sticky_bit)
+
+	.text
+	.even
+
+| void __clear_sticky_bits(void);
+SYM (__clear_sticky_bit):		
+	PICLEA	SYM (_fpCCR),a0
+#ifndef __mcoldfire__
+	movew	IMM (0),a0@(STICK)
+#else
+	clr.w	a0@(STICK)
+#endif
+	rts
+
+|=============================================================================
+|                           $_exception_handler
+|=============================================================================
+
+	.globl  $_exception_handler
+
+	.text
+	.even
+
+| This is the common exit point if an exception occurs.
+| NOTE: it is NOT callable from C!
+| It expects the exception type in d7, the format (SINGLE_FLOAT,
+| DOUBLE_FLOAT or LONG_FLOAT) in d6, and the last operation code in d5.
+| It sets the corresponding exception and sticky bits, and the format. 
+| Depending on the format if fills the corresponding slots for the 
+| operands which produced the exception (all this information is provided
+| so if you write your own exception handlers you have enough information
+| to deal with the problem).
+| Then checks to see if the corresponding exception is trap-enabled, 
+| in which case it pushes the address of _fpCCR and traps through 
+| trap FPTRAP (15 for the moment).
+
+FPTRAP = 15
+
+$_exception_handler:
+	PICLEA	SYM (_fpCCR),a0
+	movew	d7,a0@(EBITS)	| set __exception_bits
+#ifndef __mcoldfire__
+	orw	d7,a0@(STICK)	| and __sticky_bits
+#else
+	movew	a0@(STICK),d4
+	orl	d7,d4
+	movew	d4,a0@(STICK)
+#endif
+	movew	d6,a0@(FORMT)	| and __format
+	movew	d5,a0@(LASTO)	| and __last_operation
+
+| Now put the operands in place:
+#ifndef __mcoldfire__
+	cmpw	IMM (SINGLE_FLOAT),d6
+#else
+	cmpl	IMM (SINGLE_FLOAT),d6
+#endif
+	beq	1f
+	movel	a6@(8),a0@(OPER1)
+	movel	a6@(12),a0@(OPER1+4)
+	movel	a6@(16),a0@(OPER2)
+	movel	a6@(20),a0@(OPER2+4)
+	bra	2f
+1:	movel	a6@(8),a0@(OPER1)
+	movel	a6@(12),a0@(OPER2)
+2:
+| And check whether the exception is trap-enabled:
+#ifndef __mcoldfire__
+	andw	a0@(TRAPE),d7	| is exception trap-enabled?
+#else
+	clrl	d6
+	movew	a0@(TRAPE),d6
+	andl	d6,d7
+#endif
+	beq	1f		| no, exit
+	PICPEA	SYM (_fpCCR),a1	| yes, push address of _fpCCR
+	trap	IMM (FPTRAP)	| and trap
+#ifndef __mcoldfire__
+1:	moveml	sp@+,d2-d7	| restore data registers
+#else
+1:	moveml	sp@,d2-d7
+	| XXX if frame pointer is ever removed, stack pointer must
+	| be adjusted here.
+#endif
+	unlk	a6		| and return
+	rts
+#endif /* L_floatex */
+
+#ifdef  L_mulsi3
+	.text
+	FUNC(__mulsi3)
+	.globl	SYM (__mulsi3)
+SYM (__mulsi3):
+	movew	sp@(4), d0	/* x0 -> d0 */
+	muluw	sp@(10), d0	/* x0*y1 */
+	movew	sp@(6), d1	/* x1 -> d1 */
+	muluw	sp@(8), d1	/* x1*y0 */
+#ifndef __mcoldfire__
+	addw	d1, d0
+#else
+	addl	d1, d0
+#endif
+	swap	d0
+	clrw	d0
+	movew	sp@(6), d1	/* x1 -> d1 */
+	muluw	sp@(10), d1	/* x1*y1 */
+	addl	d1, d0
+
+	rts
+#endif /* L_mulsi3 */
+
+#ifdef  L_udivsi3
+	.text
+	FUNC(__udivsi3)
+	.globl	SYM (__udivsi3)
+SYM (__udivsi3):
+#ifndef __mcoldfire__
+	movel	d2, sp@-
+	movel	sp@(12), d1	/* d1 = divisor */
+	movel	sp@(8), d0	/* d0 = dividend */
+
+	cmpl	IMM (0x10000), d1 /* divisor >= 2 ^ 16 ?   */
+	jcc	L3		/* then try next algorithm */
+	movel	d0, d2
+	clrw	d2
+	swap	d2
+	divu	d1, d2          /* high quotient in lower word */
+	movew	d2, d0		/* save high quotient */
+	swap	d0
+	movew	sp@(10), d2	/* get low dividend + high rest */
+	divu	d1, d2		/* low quotient */
+	movew	d2, d0
+	jra	L6
+
+L3:	movel	d1, d2		/* use d2 as divisor backup */
+L4:	lsrl	IMM (1), d1	/* shift divisor */
+	lsrl	IMM (1), d0	/* shift dividend */
+	cmpl	IMM (0x10000), d1 /* still divisor >= 2 ^ 16 ?  */
+	jcc	L4
+	divu	d1, d0		/* now we have 16-bit divisor */
+	andl	IMM (0xffff), d0 /* mask out divisor, ignore remainder */
+
+/* Multiply the 16-bit tentative quotient with the 32-bit divisor.  Because of
+   the operand ranges, this might give a 33-bit product.  If this product is
+   greater than the dividend, the tentative quotient was too large. */
+	movel	d2, d1
+	mulu	d0, d1		/* low part, 32 bits */
+	swap	d2
+	mulu	d0, d2		/* high part, at most 17 bits */
+	swap	d2		/* align high part with low part */
+	tstw	d2		/* high part 17 bits? */
+	jne	L5		/* if 17 bits, quotient was too large */
+	addl	d2, d1		/* add parts */
+	jcs	L5		/* if sum is 33 bits, quotient was too large */
+	cmpl	sp@(8), d1	/* compare the sum with the dividend */
+	jls	L6		/* if sum > dividend, quotient was too large */
+L5:	subql	IMM (1), d0	/* adjust quotient */
+
+L6:	movel	sp@+, d2
+	rts
+
+#else /* __mcoldfire__ */
+
+/* ColdFire implementation of non-restoring division algorithm from
+   Hennessy & Patterson, Appendix A. */
+	link	a6,IMM (-12)
+	moveml	d2-d4,sp@
+	movel	a6@(8),d0
+	movel	a6@(12),d1
+	clrl	d2		| clear p
+	moveq	IMM (31),d4
+L1:	addl	d0,d0		| shift reg pair (p,a) one bit left
+	addxl	d2,d2
+	movl	d2,d3		| subtract b from p, store in tmp.
+	subl	d1,d3
+	jcs	L2		| if no carry,
+	bset	IMM (0),d0	| set the low order bit of a to 1,
+	movl	d3,d2		| and store tmp in p.
+L2:	subql	IMM (1),d4
+	jcc	L1
+	moveml	sp@,d2-d4	| restore data registers
+	unlk	a6		| and return
+	rts
+#endif /* __mcoldfire__ */
+
+#endif /* L_udivsi3 */
+
+#ifdef  L_divsi3
+	.text
+	FUNC(__divsi3)
+	.globl	SYM (__divsi3)
+SYM (__divsi3):
+	movel	d2, sp@-
+
+	moveq	IMM (1), d2	/* sign of result stored in d2 (=1 or =-1) */
+	movel	sp@(12), d1	/* d1 = divisor */
+	jpl	L1
+	negl	d1
+#ifndef __mcoldfire__
+	negb	d2		/* change sign because divisor <0  */
+#else
+	negl	d2		/* change sign because divisor <0  */
+#endif
+L1:	movel	sp@(8), d0	/* d0 = dividend */
+	jpl	L2
+	negl	d0
+#ifndef __mcoldfire__
+	negb	d2
+#else
+	negl	d2
+#endif
+
+L2:	movel	d1, sp@-
+	movel	d0, sp@-
+	PICCALL	SYM (__udivsi3)	/* divide abs(dividend) by abs(divisor) */
+	addql	IMM (8), sp
+
+	tstb	d2
+	jpl	L3
+	negl	d0
+
+L3:	movel	sp@+, d2
+	rts
+#endif /* L_divsi3 */
+
+#ifdef  L_umodsi3
+	.text
+	FUNC(__umodsi3)
+	.globl	SYM (__umodsi3)
+SYM (__umodsi3):
+	movel	sp@(8), d1	/* d1 = divisor */
+	movel	sp@(4), d0	/* d0 = dividend */
+	movel	d1, sp@-
+	movel	d0, sp@-
+	PICCALL	SYM (__udivsi3)
+	addql	IMM (8), sp
+	movel	sp@(8), d1	/* d1 = divisor */
+#ifndef __mcoldfire__
+	movel	d1, sp@-
+	movel	d0, sp@-
+	PICCALL	SYM (__mulsi3)	/* d0 = (a/b)*b */
+	addql	IMM (8), sp
+#else
+	mulsl	d1,d0
+#endif
+	movel	sp@(4), d1	/* d1 = dividend */
+	subl	d0, d1		/* d1 = a - (a/b)*b */
+	movel	d1, d0
+	rts
+#endif /* L_umodsi3 */
+
+#ifdef  L_modsi3
+	.text
+	FUNC(__modsi3)
+	.globl	SYM (__modsi3)
+SYM (__modsi3):
+	movel	sp@(8), d1	/* d1 = divisor */
+	movel	sp@(4), d0	/* d0 = dividend */
+	movel	d1, sp@-
+	movel	d0, sp@-
+	PICCALL	SYM (__divsi3)
+	addql	IMM (8), sp
+	movel	sp@(8), d1	/* d1 = divisor */
+#ifndef __mcoldfire__
+	movel	d1, sp@-
+	movel	d0, sp@-
+	PICCALL	SYM (__mulsi3)	/* d0 = (a/b)*b */
+	addql	IMM (8), sp
+#else
+	mulsl	d1,d0
+#endif
+	movel	sp@(4), d1	/* d1 = dividend */
+	subl	d0, d1		/* d1 = a - (a/b)*b */
+	movel	d1, d0
+	rts
+#endif /* L_modsi3 */
+
+
+#ifdef  L_double
+
+	.globl	SYM (_fpCCR)
+	.globl  $_exception_handler
+
+QUIET_NaN      = 0xffffffff
+
+D_MAX_EXP      = 0x07ff
+D_BIAS         = 1022
+DBL_MAX_EXP    = D_MAX_EXP - D_BIAS
+DBL_MIN_EXP    = 1 - D_BIAS
+DBL_MANT_DIG   = 53
+
+INEXACT_RESULT 		= 0x0001
+UNDERFLOW 		= 0x0002
+OVERFLOW 		= 0x0004
+DIVIDE_BY_ZERO 		= 0x0008
+INVALID_OPERATION 	= 0x0010
+
+DOUBLE_FLOAT = 2
+
+NOOP         = 0
+ADD          = 1
+MULTIPLY     = 2
+DIVIDE       = 3
+NEGATE       = 4
+COMPARE      = 5
+EXTENDSFDF   = 6
+TRUNCDFSF    = 7
+
+UNKNOWN           = -1
+ROUND_TO_NEAREST  = 0 | round result to nearest representable value
+ROUND_TO_ZERO     = 1 | round result towards zero
+ROUND_TO_PLUS     = 2 | round result towards plus infinity
+ROUND_TO_MINUS    = 3 | round result towards minus infinity
+
+| Entry points:
+
+	.globl SYM (__adddf3)
+	.globl SYM (__subdf3)
+	.globl SYM (__muldf3)
+	.globl SYM (__divdf3)
+	.globl SYM (__negdf2)
+	.globl SYM (__cmpdf2)
+	.globl SYM (__cmpdf2_internal)
+	.hidden SYM (__cmpdf2_internal)
+
+	.text
+	.even
+
+| These are common routines to return and signal exceptions.	
+
+Ld$den:
+| Return and signal a denormalized number
+	orl	d7,d0
+	movew	IMM (INEXACT_RESULT+UNDERFLOW),d7
+	moveq	IMM (DOUBLE_FLOAT),d6
+	PICJUMP	$_exception_handler
+
+Ld$infty:
+Ld$overflow:
+| Return a properly signed INFINITY and set the exception flags 
+	movel	IMM (0x7ff00000),d0
+	movel	IMM (0),d1
+	orl	d7,d0
+	movew	IMM (INEXACT_RESULT+OVERFLOW),d7
+	moveq	IMM (DOUBLE_FLOAT),d6
+	PICJUMP	$_exception_handler
+
+Ld$underflow:
+| Return 0 and set the exception flags 
+	movel	IMM (0),d0
+	movel	d0,d1
+	movew	IMM (INEXACT_RESULT+UNDERFLOW),d7
+	moveq	IMM (DOUBLE_FLOAT),d6
+	PICJUMP	$_exception_handler
+
+Ld$inop:
+| Return a quiet NaN and set the exception flags
+	movel	IMM (QUIET_NaN),d0
+	movel	d0,d1
+	movew	IMM (INEXACT_RESULT+INVALID_OPERATION),d7
+	moveq	IMM (DOUBLE_FLOAT),d6
+	PICJUMP	$_exception_handler
+
+Ld$div$0:
+| Return a properly signed INFINITY and set the exception flags
+	movel	IMM (0x7ff00000),d0
+	movel	IMM (0),d1
+	orl	d7,d0
+	movew	IMM (INEXACT_RESULT+DIVIDE_BY_ZERO),d7
+	moveq	IMM (DOUBLE_FLOAT),d6
+	PICJUMP	$_exception_handler
+
+|=============================================================================
+|=============================================================================
+|                         double precision routines
+|=============================================================================
+|=============================================================================
+
+| A double precision floating point number (double) has the format:
+|
+| struct _double {
+|  unsigned int sign      : 1;  /* sign bit */ 
+|  unsigned int exponent  : 11; /* exponent, shifted by 126 */
+|  unsigned int fraction  : 52; /* fraction */
+| } double;
+| 
+| Thus sizeof(double) = 8 (64 bits). 
+|
+| All the routines are callable from C programs, and return the result 
+| in the register pair d0-d1. They also preserve all registers except 
+| d0-d1 and a0-a1.
+
+|=============================================================================
+|                              __subdf3
+|=============================================================================
+
+| double __subdf3(double, double);
+	FUNC(__subdf3)
+SYM (__subdf3):
+	bchg	IMM (31),sp@(12) | change sign of second operand
+				| and fall through, so we always add
+|=============================================================================
+|                              __adddf3
+|=============================================================================
+
+| double __adddf3(double, double);
+	FUNC(__adddf3)
+SYM (__adddf3):
+#ifndef __mcoldfire__
+	link	a6,IMM (0)	| everything will be done in registers
+	moveml	d2-d7,sp@-	| save all data registers and a2 (but d0-d1)
+#else
+	link	a6,IMM (-24)
+	moveml	d2-d7,sp@
+#endif
+	movel	a6@(8),d0	| get first operand
+	movel	a6@(12),d1	| 
+	movel	a6@(16),d2	| get second operand
+	movel	a6@(20),d3	| 
+
+	movel	d0,d7		| get d0's sign bit in d7 '
+	addl	d1,d1		| check and clear sign bit of a, and gain one
+	addxl	d0,d0		| bit of extra precision
+	beq	Ladddf$b	| if zero return second operand
+
+	movel	d2,d6		| save sign in d6 
+	addl	d3,d3		| get rid of sign bit and gain one bit of
+	addxl	d2,d2		| extra precision
+	beq	Ladddf$a	| if zero return first operand
+
+	andl	IMM (0x80000000),d7 | isolate a's sign bit '
+        swap	d6		| and also b's sign bit '
+#ifndef __mcoldfire__
+	andw	IMM (0x8000),d6	|
+	orw	d6,d7		| and combine them into d7, so that a's sign '
+				| bit is in the high word and b's is in the '
+				| low word, so d6 is free to be used
+#else
+	andl	IMM (0x8000),d6
+	orl	d6,d7
+#endif
+	movel	d7,a0		| now save d7 into a0, so d7 is free to
+                		| be used also
+
+| Get the exponents and check for denormalized and/or infinity.
+
+	movel	IMM (0x001fffff),d6 | mask for the fraction
+	movel	IMM (0x00200000),d7 | mask to put hidden bit back
+
+	movel	d0,d4		| 
+	andl	d6,d0		| get fraction in d0
+	notl	d6		| make d6 into mask for the exponent
+	andl	d6,d4		| get exponent in d4
+	beq	Ladddf$a$den	| branch if a is denormalized
+	cmpl	d6,d4		| check for INFINITY or NaN
+	beq	Ladddf$nf       | 
+	orl	d7,d0		| and put hidden bit back
+Ladddf$1:
+	swap	d4		| shift right exponent so that it starts
+#ifndef __mcoldfire__
+	lsrw	IMM (5),d4	| in bit 0 and not bit 20
+#else
+	lsrl	IMM (5),d4	| in bit 0 and not bit 20
+#endif
+| Now we have a's exponent in d4 and fraction in d0-d1 '
+	movel	d2,d5		| save b to get exponent
+	andl	d6,d5		| get exponent in d5
+	beq	Ladddf$b$den	| branch if b is denormalized
+	cmpl	d6,d5		| check for INFINITY or NaN
+	beq	Ladddf$nf
+	notl	d6		| make d6 into mask for the fraction again
+	andl	d6,d2		| and get fraction in d2
+	orl	d7,d2		| and put hidden bit back
+Ladddf$2:
+	swap	d5		| shift right exponent so that it starts
+#ifndef __mcoldfire__
+	lsrw	IMM (5),d5	| in bit 0 and not bit 20
+#else
+	lsrl	IMM (5),d5	| in bit 0 and not bit 20
+#endif
+
+| Now we have b's exponent in d5 and fraction in d2-d3. '
+
+| The situation now is as follows: the signs are combined in a0, the 
+| numbers are in d0-d1 (a) and d2-d3 (b), and the exponents in d4 (a)
+| and d5 (b). To do the rounding correctly we need to keep all the
+| bits until the end, so we need to use d0-d1-d2-d3 for the first number
+| and d4-d5-d6-d7 for the second. To do this we store (temporarily) the
+| exponents in a2-a3.
+
+#ifndef __mcoldfire__
+	moveml	a2-a3,sp@-	| save the address registers
+#else
+	movel	a2,sp@-	
+	movel	a3,sp@-	
+	movel	a4,sp@-	
+#endif
+
+	movel	d4,a2		| save the exponents
+	movel	d5,a3		| 
+
+	movel	IMM (0),d7	| and move the numbers around
+	movel	d7,d6		|
+	movel	d3,d5		|
+	movel	d2,d4		|
+	movel	d7,d3		|
+	movel	d7,d2		|
+
+| Here we shift the numbers until the exponents are the same, and put 
+| the largest exponent in a2.
+#ifndef __mcoldfire__
+	exg	d4,a2		| get exponents back
+	exg	d5,a3		|
+	cmpw	d4,d5		| compare the exponents
+#else
+	movel	d4,a4		| get exponents back
+	movel	a2,d4
+	movel	a4,a2
+	movel	d5,a4
+	movel	a3,d5
+	movel	a4,a3
+	cmpl	d4,d5		| compare the exponents
+#endif
+	beq	Ladddf$3	| if equal don't shift '
+	bhi	9f		| branch if second exponent is higher
+
+| Here we have a's exponent larger than b's, so we have to shift b. We do 
+| this by using as counter d2:
+1:	movew	d4,d2		| move largest exponent to d2
+#ifndef __mcoldfire__
+	subw	d5,d2		| and subtract second exponent
+	exg	d4,a2		| get back the longs we saved
+	exg	d5,a3		|
+#else
+	subl	d5,d2		| and subtract second exponent
+	movel	d4,a4		| get back the longs we saved
+	movel	a2,d4
+	movel	a4,a2
+	movel	d5,a4
+	movel	a3,d5
+	movel	a4,a3
+#endif
+| if difference is too large we don't shift (actually, we can just exit) '
+#ifndef __mcoldfire__
+	cmpw	IMM (DBL_MANT_DIG+2),d2
+#else
+	cmpl	IMM (DBL_MANT_DIG+2),d2
+#endif
+	bge	Ladddf$b$small
+#ifndef __mcoldfire__
+	cmpw	IMM (32),d2	| if difference >= 32, shift by longs
+#else
+	cmpl	IMM (32),d2	| if difference >= 32, shift by longs
+#endif
+	bge	5f
+2:
+#ifndef __mcoldfire__
+	cmpw	IMM (16),d2	| if difference >= 16, shift by words	
+#else
+	cmpl	IMM (16),d2	| if difference >= 16, shift by words	
+#endif
+	bge	6f
+	bra	3f		| enter dbra loop
+
+4:
+#ifndef __mcoldfire__
+	lsrl	IMM (1),d4
+	roxrl	IMM (1),d5
+	roxrl	IMM (1),d6
+	roxrl	IMM (1),d7
+#else
+	lsrl	IMM (1),d7
+	btst	IMM (0),d6
+	beq	10f
+	bset	IMM (31),d7
+10:	lsrl	IMM (1),d6
+	btst	IMM (0),d5
+	beq	11f
+	bset	IMM (31),d6
+11:	lsrl	IMM (1),d5
+	btst	IMM (0),d4
+	beq	12f
+	bset	IMM (31),d5
+12:	lsrl	IMM (1),d4
+#endif
+3:
+#ifndef __mcoldfire__
+	dbra	d2,4b
+#else
+	subql	IMM (1),d2
+	bpl	4b	
+#endif
+	movel	IMM (0),d2
+	movel	d2,d3	
+	bra	Ladddf$4
+5:
+	movel	d6,d7
+	movel	d5,d6
+	movel	d4,d5
+	movel	IMM (0),d4
+#ifndef __mcoldfire__
+	subw	IMM (32),d2
+#else
+	subl	IMM (32),d2
+#endif
+	bra	2b
+6:
+	movew	d6,d7
+	swap	d7
+	movew	d5,d6
+	swap	d6
+	movew	d4,d5
+	swap	d5
+	movew	IMM (0),d4
+	swap	d4
+#ifndef __mcoldfire__
+	subw	IMM (16),d2
+#else
+	subl	IMM (16),d2
+#endif
+	bra	3b
+	
+9:
+#ifndef __mcoldfire__
+	exg	d4,d5
+	movew	d4,d6
+	subw	d5,d6		| keep d5 (largest exponent) in d4
+	exg	d4,a2
+	exg	d5,a3
+#else
+	movel	d5,d6
+	movel	d4,d5
+	movel	d6,d4
+	subl	d5,d6
+	movel	d4,a4
+	movel	a2,d4
+	movel	a4,a2
+	movel	d5,a4
+	movel	a3,d5
+	movel	a4,a3
+#endif
+| if difference is too large we don't shift (actually, we can just exit) '
+#ifndef __mcoldfire__
+	cmpw	IMM (DBL_MANT_DIG+2),d6
+#else
+	cmpl	IMM (DBL_MANT_DIG+2),d6
+#endif
+	bge	Ladddf$a$small
+#ifndef __mcoldfire__
+	cmpw	IMM (32),d6	| if difference >= 32, shift by longs
+#else
+	cmpl	IMM (32),d6	| if difference >= 32, shift by longs
+#endif
+	bge	5f
+2:
+#ifndef __mcoldfire__
+	cmpw	IMM (16),d6	| if difference >= 16, shift by words	
+#else
+	cmpl	IMM (16),d6	| if difference >= 16, shift by words	
+#endif
+	bge	6f
+	bra	3f		| enter dbra loop
+
+4:
+#ifndef __mcoldfire__
+	lsrl	IMM (1),d0
+	roxrl	IMM (1),d1
+	roxrl	IMM (1),d2
+	roxrl	IMM (1),d3
+#else
+	lsrl	IMM (1),d3
+	btst	IMM (0),d2
+	beq	10f
+	bset	IMM (31),d3
+10:	lsrl	IMM (1),d2
+	btst	IMM (0),d1
+	beq	11f
+	bset	IMM (31),d2
+11:	lsrl	IMM (1),d1
+	btst	IMM (0),d0
+	beq	12f
+	bset	IMM (31),d1
+12:	lsrl	IMM (1),d0
+#endif
+3:
+#ifndef __mcoldfire__
+	dbra	d6,4b
+#else
+	subql	IMM (1),d6
+	bpl	4b
+#endif
+	movel	IMM (0),d7
+	movel	d7,d6
+	bra	Ladddf$4
+5:
+	movel	d2,d3
+	movel	d1,d2
+	movel	d0,d1
+	movel	IMM (0),d0
+#ifndef __mcoldfire__
+	subw	IMM (32),d6
+#else
+	subl	IMM (32),d6
+#endif
+	bra	2b
+6:
+	movew	d2,d3
+	swap	d3
+	movew	d1,d2
+	swap	d2
+	movew	d0,d1
+	swap	d1
+	movew	IMM (0),d0
+	swap	d0
+#ifndef __mcoldfire__
+	subw	IMM (16),d6
+#else
+	subl	IMM (16),d6
+#endif
+	bra	3b
+Ladddf$3:
+#ifndef __mcoldfire__
+	exg	d4,a2	
+	exg	d5,a3
+#else
+	movel	d4,a4
+	movel	a2,d4
+	movel	a4,a2
+	movel	d5,a4
+	movel	a3,d5
+	movel	a4,a3
+#endif
+Ladddf$4:	
+| Now we have the numbers in d0--d3 and d4--d7, the exponent in a2, and
+| the signs in a4.
+
+| Here we have to decide whether to add or subtract the numbers:
+#ifndef __mcoldfire__
+	exg	d7,a0		| get the signs 
+	exg	d6,a3		| a3 is free to be used
+#else
+	movel	d7,a4
+	movel	a0,d7
+	movel	a4,a0
+	movel	d6,a4
+	movel	a3,d6
+	movel	a4,a3
+#endif
+	movel	d7,d6		|
+	movew	IMM (0),d7	| get a's sign in d7 '
+	swap	d6              |
+	movew	IMM (0),d6	| and b's sign in d6 '
+	eorl	d7,d6		| compare the signs
+	bmi	Lsubdf$0	| if the signs are different we have 
+				| to subtract
+#ifndef __mcoldfire__
+	exg	d7,a0		| else we add the numbers
+	exg	d6,a3		|
+#else
+	movel	d7,a4
+	movel	a0,d7
+	movel	a4,a0
+	movel	d6,a4
+	movel	a3,d6
+	movel	a4,a3
+#endif
+	addl	d7,d3		|
+	addxl	d6,d2		|
+	addxl	d5,d1		| 
+	addxl	d4,d0           |
+
+	movel	a2,d4		| return exponent to d4
+	movel	a0,d7		| 
+	andl	IMM (0x80000000),d7 | d7 now has the sign
+
+#ifndef __mcoldfire__
+	moveml	sp@+,a2-a3	
+#else
+	movel	sp@+,a4	
+	movel	sp@+,a3	
+	movel	sp@+,a2	
+#endif
+
+| Before rounding normalize so bit #DBL_MANT_DIG is set (we will consider
+| the case of denormalized numbers in the rounding routine itself).
+| As in the addition (not in the subtraction!) we could have set 
+| one more bit we check this:
+	btst	IMM (DBL_MANT_DIG+1),d0	
+	beq	1f
+#ifndef __mcoldfire__
+	lsrl	IMM (1),d0
+	roxrl	IMM (1),d1
+	roxrl	IMM (1),d2
+	roxrl	IMM (1),d3
+	addw	IMM (1),d4
+#else
+	lsrl	IMM (1),d3
+	btst	IMM (0),d2
+	beq	10f
+	bset	IMM (31),d3
+10:	lsrl	IMM (1),d2
+	btst	IMM (0),d1
+	beq	11f
+	bset	IMM (31),d2
+11:	lsrl	IMM (1),d1
+	btst	IMM (0),d0
+	beq	12f
+	bset	IMM (31),d1
+12:	lsrl	IMM (1),d0
+	addl	IMM (1),d4
+#endif
+1:
+	lea	pc@(Ladddf$5),a0 | to return from rounding routine
+	PICLEA	SYM (_fpCCR),a1	| check the rounding mode
+#ifdef __mcoldfire__
+	clrl	d6
+#endif
+	movew	a1@(6),d6	| rounding mode in d6
+	beq	Lround$to$nearest
+#ifndef __mcoldfire__
+	cmpw	IMM (ROUND_TO_PLUS),d6
+#else
+	cmpl	IMM (ROUND_TO_PLUS),d6
+#endif
+	bhi	Lround$to$minus
+	blt	Lround$to$zero
+	bra	Lround$to$plus
+Ladddf$5:
+| Put back the exponent and check for overflow
+#ifndef __mcoldfire__
+	cmpw	IMM (0x7ff),d4	| is the exponent big?
+#else
+	cmpl	IMM (0x7ff),d4	| is the exponent big?
+#endif
+	bge	1f
+	bclr	IMM (DBL_MANT_DIG-1),d0
+#ifndef __mcoldfire__
+	lslw	IMM (4),d4	| put exponent back into position
+#else
+	lsll	IMM (4),d4	| put exponent back into position
+#endif
+	swap	d0		| 
+#ifndef __mcoldfire__
+	orw	d4,d0		|
+#else
+	orl	d4,d0		|
+#endif
+	swap	d0		|
+	bra	Ladddf$ret
+1:
+	moveq	IMM (ADD),d5
+	bra	Ld$overflow
+
+Lsubdf$0:
+| Here we do the subtraction.
+#ifndef __mcoldfire__
+	exg	d7,a0		| put sign back in a0
+	exg	d6,a3		|
+#else
+	movel	d7,a4
+	movel	a0,d7
+	movel	a4,a0
+	movel	d6,a4
+	movel	a3,d6
+	movel	a4,a3
+#endif
+	subl	d7,d3		|
+	subxl	d6,d2		|
+	subxl	d5,d1		|
+	subxl	d4,d0		|
+	beq	Ladddf$ret$1	| if zero just exit
+	bpl	1f		| if positive skip the following
+	movel	a0,d7		|
+	bchg	IMM (31),d7	| change sign bit in d7
+	movel	d7,a0		|
+	negl	d3		|
+	negxl	d2		|
+	negxl	d1              | and negate result
+	negxl	d0              |
+1:	
+	movel	a2,d4		| return exponent to d4
+	movel	a0,d7
+	andl	IMM (0x80000000),d7 | isolate sign bit
+#ifndef __mcoldfire__
+	moveml	sp@+,a2-a3	|
+#else
+	movel	sp@+,a4
+	movel	sp@+,a3
+	movel	sp@+,a2
+#endif
+
+| Before rounding normalize so bit #DBL_MANT_DIG is set (we will consider
+| the case of denormalized numbers in the rounding routine itself).
+| As in the addition (not in the subtraction!) we could have set 
+| one more bit we check this:
+	btst	IMM (DBL_MANT_DIG+1),d0	
+	beq	1f
+#ifndef __mcoldfire__
+	lsrl	IMM (1),d0
+	roxrl	IMM (1),d1
+	roxrl	IMM (1),d2
+	roxrl	IMM (1),d3
+	addw	IMM (1),d4
+#else
+	lsrl	IMM (1),d3
+	btst	IMM (0),d2
+	beq	10f
+	bset	IMM (31),d3
+10:	lsrl	IMM (1),d2
+	btst	IMM (0),d1
+	beq	11f
+	bset	IMM (31),d2
+11:	lsrl	IMM (1),d1
+	btst	IMM (0),d0
+	beq	12f
+	bset	IMM (31),d1
+12:	lsrl	IMM (1),d0
+	addl	IMM (1),d4
+#endif
+1:
+	lea	pc@(Lsubdf$1),a0 | to return from rounding routine
+	PICLEA	SYM (_fpCCR),a1	| check the rounding mode
+#ifdef __mcoldfire__
+	clrl	d6
+#endif
+	movew	a1@(6),d6	| rounding mode in d6
+	beq	Lround$to$nearest
+#ifndef __mcoldfire__
+	cmpw	IMM (ROUND_TO_PLUS),d6
+#else
+	cmpl	IMM (ROUND_TO_PLUS),d6
+#endif
+	bhi	Lround$to$minus
+	blt	Lround$to$zero
+	bra	Lround$to$plus
+Lsubdf$1:
+| Put back the exponent and sign (we don't have overflow). '
+	bclr	IMM (DBL_MANT_DIG-1),d0	
+#ifndef __mcoldfire__
+	lslw	IMM (4),d4	| put exponent back into position
+#else
+	lsll	IMM (4),d4	| put exponent back into position
+#endif
+	swap	d0		| 
+#ifndef __mcoldfire__
+	orw	d4,d0		|
+#else
+	orl	d4,d0		|
+#endif
+	swap	d0		|
+	bra	Ladddf$ret
+
+| If one of the numbers was too small (difference of exponents >= 
+| DBL_MANT_DIG+1) we return the other (and now we don't have to '
+| check for finiteness or zero).
+Ladddf$a$small:
+#ifndef __mcoldfire__
+	moveml	sp@+,a2-a3	
+#else
+	movel	sp@+,a4
+	movel	sp@+,a3
+	movel	sp@+,a2
+#endif
+	movel	a6@(16),d0
+	movel	a6@(20),d1
+	PICLEA	SYM (_fpCCR),a0
+	movew	IMM (0),a0@
+#ifndef __mcoldfire__
+	moveml	sp@+,d2-d7	| restore data registers
+#else
+	moveml	sp@,d2-d7
+	| XXX if frame pointer is ever removed, stack pointer must
+	| be adjusted here.
+#endif
+	unlk	a6		| and return
+	rts
+
+Ladddf$b$small:
+#ifndef __mcoldfire__
+	moveml	sp@+,a2-a3	
+#else
+	movel	sp@+,a4	
+	movel	sp@+,a3	
+	movel	sp@+,a2	
+#endif
+	movel	a6@(8),d0
+	movel	a6@(12),d1
+	PICLEA	SYM (_fpCCR),a0
+	movew	IMM (0),a0@
+#ifndef __mcoldfire__
+	moveml	sp@+,d2-d7	| restore data registers
+#else
+	moveml	sp@,d2-d7
+	| XXX if frame pointer is ever removed, stack pointer must
+	| be adjusted here.
+#endif
+	unlk	a6		| and return
+	rts
+
+Ladddf$a$den:
+	movel	d7,d4		| d7 contains 0x00200000
+	bra	Ladddf$1
+
+Ladddf$b$den:
+	movel	d7,d5           | d7 contains 0x00200000
+	notl	d6
+	bra	Ladddf$2
+
+Ladddf$b:
+| Return b (if a is zero)
+	movel	d2,d0
+	movel	d3,d1
+	bne	1f			| Check if b is -0
+	cmpl	IMM (0x80000000),d0
+	bne	1f
+	andl	IMM (0x80000000),d7	| Use the sign of a
+	clrl	d0
+	bra	Ladddf$ret
+Ladddf$a:
+	movel	a6@(8),d0
+	movel	a6@(12),d1
+1:
+	moveq	IMM (ADD),d5
+| Check for NaN and +/-INFINITY.
+	movel	d0,d7         		|
+	andl	IMM (0x80000000),d7	|
+	bclr	IMM (31),d0		|
+	cmpl	IMM (0x7ff00000),d0	|
+	bge	2f			|
+	movel	d0,d0           	| check for zero, since we don't  '
+	bne	Ladddf$ret		| want to return -0 by mistake
+	bclr	IMM (31),d7		|
+	bra	Ladddf$ret		|
+2:
+	andl	IMM (0x000fffff),d0	| check for NaN (nonzero fraction)
+	orl	d1,d0			|
+	bne	Ld$inop         	|
+	bra	Ld$infty		|
+	
+Ladddf$ret$1:
+#ifndef __mcoldfire__
+	moveml	sp@+,a2-a3	| restore regs and exit
+#else
+	movel	sp@+,a4
+	movel	sp@+,a3
+	movel	sp@+,a2
+#endif
+
+Ladddf$ret:
+| Normal exit.
+	PICLEA	SYM (_fpCCR),a0
+	movew	IMM (0),a0@
+	orl	d7,d0		| put sign bit back
+#ifndef __mcoldfire__
+	moveml	sp@+,d2-d7
+#else
+	moveml	sp@,d2-d7
+	| XXX if frame pointer is ever removed, stack pointer must
+	| be adjusted here.
+#endif
+	unlk	a6
+	rts
+
+Ladddf$ret$den:
+| Return a denormalized number.
+#ifndef __mcoldfire__
+	lsrl	IMM (1),d0	| shift right once more
+	roxrl	IMM (1),d1	|
+#else
+	lsrl	IMM (1),d1
+	btst	IMM (0),d0
+	beq	10f
+	bset	IMM (31),d1
+10:	lsrl	IMM (1),d0
+#endif
+	bra	Ladddf$ret
+
+Ladddf$nf:
+	moveq	IMM (ADD),d5
+| This could be faster but it is not worth the effort, since it is not
+| executed very often. We sacrifice speed for clarity here.
+	movel	a6@(8),d0	| get the numbers back (remember that we
+	movel	a6@(12),d1	| did some processing already)
+	movel	a6@(16),d2	| 
+	movel	a6@(20),d3	| 
+	movel	IMM (0x7ff00000),d4 | useful constant (INFINITY)
+	movel	d0,d7		| save sign bits
+	movel	d2,d6		| 
+	bclr	IMM (31),d0	| clear sign bits
+	bclr	IMM (31),d2	| 
+| We know that one of them is either NaN of +/-INFINITY
+| Check for NaN (if either one is NaN return NaN)
+	cmpl	d4,d0		| check first a (d0)
+	bhi	Ld$inop		| if d0 > 0x7ff00000 or equal and
+	bne	2f
+	tstl	d1		| d1 > 0, a is NaN
+	bne	Ld$inop		| 
+2:	cmpl	d4,d2		| check now b (d1)
+	bhi	Ld$inop		| 
+	bne	3f
+	tstl	d3		| 
+	bne	Ld$inop		| 
+3:
+| Now comes the check for +/-INFINITY. We know that both are (maybe not
+| finite) numbers, but we have to check if both are infinite whether we
+| are adding or subtracting them.
+	eorl	d7,d6		| to check sign bits
+	bmi	1f
+	andl	IMM (0x80000000),d7 | get (common) sign bit
+	bra	Ld$infty
+1:
+| We know one (or both) are infinite, so we test for equality between the
+| two numbers (if they are equal they have to be infinite both, so we
+| return NaN).
+	cmpl	d2,d0		| are both infinite?
+	bne	1f		| if d0 <> d2 they are not equal
+	cmpl	d3,d1		| if d0 == d2 test d3 and d1
+	beq	Ld$inop		| if equal return NaN
+1:	
+	andl	IMM (0x80000000),d7 | get a's sign bit '
+	cmpl	d4,d0		| test now for infinity
+	beq	Ld$infty	| if a is INFINITY return with this sign
+	bchg	IMM (31),d7	| else we know b is INFINITY and has
+	bra	Ld$infty	| the opposite sign
+
+|=============================================================================
+|                              __muldf3
+|=============================================================================
+
+| double __muldf3(double, double);
+	FUNC(__muldf3)
+SYM (__muldf3):
+#ifndef __mcoldfire__
+	link	a6,IMM (0)
+	moveml	d2-d7,sp@-
+#else
+	link	a6,IMM (-24)
+	moveml	d2-d7,sp@
+#endif
+	movel	a6@(8),d0		| get a into d0-d1
+	movel	a6@(12),d1		| 
+	movel	a6@(16),d2		| and b into d2-d3
+	movel	a6@(20),d3		|
+	movel	d0,d7			| d7 will hold the sign of the product
+	eorl	d2,d7			|
+	andl	IMM (0x80000000),d7	|
+	movel	d7,a0			| save sign bit into a0 
+	movel	IMM (0x7ff00000),d7	| useful constant (+INFINITY)
+	movel	d7,d6			| another (mask for fraction)
+	notl	d6			|
+	bclr	IMM (31),d0		| get rid of a's sign bit '
+	movel	d0,d4			| 
+	orl	d1,d4			| 
+	beq	Lmuldf$a$0		| branch if a is zero
+	movel	d0,d4			|
+	bclr	IMM (31),d2		| get rid of b's sign bit '
+	movel	d2,d5			|
+	orl	d3,d5			| 
+	beq	Lmuldf$b$0		| branch if b is zero
+	movel	d2,d5			| 
+	cmpl	d7,d0			| is a big?
+	bhi	Lmuldf$inop		| if a is NaN return NaN
+	beq	Lmuldf$a$nf		| we still have to check d1 and b ...
+	cmpl	d7,d2			| now compare b with INFINITY
+	bhi	Lmuldf$inop		| is b NaN?
+	beq	Lmuldf$b$nf 		| we still have to check d3 ...
+| Here we have both numbers finite and nonzero (and with no sign bit).
+| Now we get the exponents into d4 and d5.
+	andl	d7,d4			| isolate exponent in d4
+	beq	Lmuldf$a$den		| if exponent zero, have denormalized
+	andl	d6,d0			| isolate fraction
+	orl	IMM (0x00100000),d0	| and put hidden bit back
+	swap	d4			| I like exponents in the first byte
+#ifndef __mcoldfire__
+	lsrw	IMM (4),d4		| 
+#else
+	lsrl	IMM (4),d4		| 
+#endif
+Lmuldf$1:			
+	andl	d7,d5			|
+	beq	Lmuldf$b$den		|
+	andl	d6,d2			|
+	orl	IMM (0x00100000),d2	| and put hidden bit back
+	swap	d5			|
+#ifndef __mcoldfire__
+	lsrw	IMM (4),d5		|
+#else
+	lsrl	IMM (4),d5		|
+#endif
+Lmuldf$2:				|
+#ifndef __mcoldfire__
+	addw	d5,d4			| add exponents
+	subw	IMM (D_BIAS+1),d4	| and subtract bias (plus one)
+#else
+	addl	d5,d4			| add exponents
+	subl	IMM (D_BIAS+1),d4	| and subtract bias (plus one)
+#endif
+
+| We are now ready to do the multiplication. The situation is as follows:
+| both a and b have bit 52 ( bit 20 of d0 and d2) set (even if they were 
+| denormalized to start with!), which means that in the product bit 104 
+| (which will correspond to bit 8 of the fourth long) is set.
+
+| Here we have to do the product.
+| To do it we have to juggle the registers back and forth, as there are not
+| enough to keep everything in them. So we use the address registers to keep
+| some intermediate data.
+
+#ifndef __mcoldfire__
+	moveml	a2-a3,sp@-	| save a2 and a3 for temporary use
+#else
+	movel	a2,sp@-
+	movel	a3,sp@-
+	movel	a4,sp@-
+#endif
+	movel	IMM (0),a2	| a2 is a null register
+	movel	d4,a3		| and a3 will preserve the exponent
+
+| First, shift d2-d3 so bit 20 becomes bit 31:
+#ifndef __mcoldfire__
+	rorl	IMM (5),d2	| rotate d2 5 places right
+	swap	d2		| and swap it
+	rorl	IMM (5),d3	| do the same thing with d3
+	swap	d3		|
+	movew	d3,d6		| get the rightmost 11 bits of d3
+	andw	IMM (0x07ff),d6	|
+	orw	d6,d2		| and put them into d2
+	andw	IMM (0xf800),d3	| clear those bits in d3
+#else
+	moveq	IMM (11),d7	| left shift d2 11 bits
+	lsll	d7,d2
+	movel	d3,d6		| get a copy of d3
+	lsll	d7,d3		| left shift d3 11 bits
+	andl	IMM (0xffe00000),d6 | get the top 11 bits of d3
+	moveq	IMM (21),d7	| right shift them 21 bits
+	lsrl	d7,d6
+	orl	d6,d2		| stick them at the end of d2
+#endif
+
+	movel	d2,d6		| move b into d6-d7
+	movel	d3,d7           | move a into d4-d5
+	movel	d0,d4           | and clear d0-d1-d2-d3 (to put result)
+	movel	d1,d5           |
+	movel	IMM (0),d3	|
+	movel	d3,d2           |
+	movel	d3,d1           |
+	movel	d3,d0	        |
+
+| We use a1 as counter:	
+	movel	IMM (DBL_MANT_DIG-1),a1		
+#ifndef __mcoldfire__
+	exg	d7,a1
+#else
+	movel	d7,a4
+	movel	a1,d7
+	movel	a4,a1
+#endif
+
+1:
+#ifndef __mcoldfire__
+	exg	d7,a1		| put counter back in a1
+#else
+	movel	d7,a4
+	movel	a1,d7
+	movel	a4,a1
+#endif
+	addl	d3,d3		| shift sum once left
+	addxl	d2,d2           |
+	addxl	d1,d1           |
+	addxl	d0,d0           |
+	addl	d7,d7		|
+	addxl	d6,d6		|
+	bcc	2f		| if bit clear skip the following
+#ifndef __mcoldfire__
+	exg	d7,a2		|
+#else
+	movel	d7,a4
+	movel	a2,d7
+	movel	a4,a2
+#endif
+	addl	d5,d3		| else add a to the sum
+	addxl	d4,d2		|
+	addxl	d7,d1		|
+	addxl	d7,d0		|
+#ifndef __mcoldfire__
+	exg	d7,a2		| 
+#else
+	movel	d7,a4
+	movel	a2,d7
+	movel	a4,a2
+#endif
+2:
+#ifndef __mcoldfire__
+	exg	d7,a1		| put counter in d7
+	dbf	d7,1b		| decrement and branch
+#else
+	movel	d7,a4
+	movel	a1,d7
+	movel	a4,a1
+	subql	IMM (1),d7
+	bpl	1b
+#endif
+
+	movel	a3,d4		| restore exponent
+#ifndef __mcoldfire__
+	moveml	sp@+,a2-a3
+#else
+	movel	sp@+,a4
+	movel	sp@+,a3
+	movel	sp@+,a2
+#endif
+
+| Now we have the product in d0-d1-d2-d3, with bit 8 of d0 set. The 
+| first thing to do now is to normalize it so bit 8 becomes bit 
+| DBL_MANT_DIG-32 (to do the rounding); later we will shift right.
+	swap	d0
+	swap	d1
+	movew	d1,d0
+	swap	d2
+	movew	d2,d1
+	swap	d3
+	movew	d3,d2
+	movew	IMM (0),d3
+#ifndef __mcoldfire__
+	lsrl	IMM (1),d0
+	roxrl	IMM (1),d1
+	roxrl	IMM (1),d2
+	roxrl	IMM (1),d3
+	lsrl	IMM (1),d0
+	roxrl	IMM (1),d1
+	roxrl	IMM (1),d2
+	roxrl	IMM (1),d3
+	lsrl	IMM (1),d0
+	roxrl	IMM (1),d1
+	roxrl	IMM (1),d2
+	roxrl	IMM (1),d3
+#else
+	moveq	IMM (29),d6
+	lsrl	IMM (3),d3
+	movel	d2,d7
+	lsll	d6,d7
+	orl	d7,d3
+	lsrl	IMM (3),d2
+	movel	d1,d7
+	lsll	d6,d7
+	orl	d7,d2
+	lsrl	IMM (3),d1
+	movel	d0,d7
+	lsll	d6,d7
+	orl	d7,d1
+	lsrl	IMM (3),d0
+#endif
+	
+| Now round, check for over- and underflow, and exit.
+	movel	a0,d7		| get sign bit back into d7
+	moveq	IMM (MULTIPLY),d5
+
+	btst	IMM (DBL_MANT_DIG+1-32),d0
+	beq	Lround$exit
+#ifndef __mcoldfire__
+	lsrl	IMM (1),d0
+	roxrl	IMM (1),d1
+	addw	IMM (1),d4
+#else
+	lsrl	IMM (1),d1
+	btst	IMM (0),d0
+	beq	10f
+	bset	IMM (31),d1
+10:	lsrl	IMM (1),d0
+	addl	IMM (1),d4
+#endif
+	bra	Lround$exit
+
+Lmuldf$inop:
+	moveq	IMM (MULTIPLY),d5
+	bra	Ld$inop
+
+Lmuldf$b$nf:
+	moveq	IMM (MULTIPLY),d5
+	movel	a0,d7		| get sign bit back into d7
+	tstl	d3		| we know d2 == 0x7ff00000, so check d3
+	bne	Ld$inop		| if d3 <> 0 b is NaN
+	bra	Ld$overflow	| else we have overflow (since a is finite)
+
+Lmuldf$a$nf:
+	moveq	IMM (MULTIPLY),d5
+	movel	a0,d7		| get sign bit back into d7
+	tstl	d1		| we know d0 == 0x7ff00000, so check d1
+	bne	Ld$inop		| if d1 <> 0 a is NaN
+	bra	Ld$overflow	| else signal overflow
+
+| If either number is zero return zero, unless the other is +/-INFINITY or
+| NaN, in which case we return NaN.
+Lmuldf$b$0:
+	moveq	IMM (MULTIPLY),d5
+#ifndef __mcoldfire__
+	exg	d2,d0		| put b (==0) into d0-d1
+	exg	d3,d1		| and a (with sign bit cleared) into d2-d3
+	movel	a0,d0		| set result sign
+#else
+	movel	d0,d2		| put a into d2-d3
+	movel	d1,d3
+	movel	a0,d0		| put result zero into d0-d1
+	movq	IMM(0),d1
+#endif
+	bra	1f
+Lmuldf$a$0:
+	movel	a0,d0		| set result sign
+	movel	a6@(16),d2	| put b into d2-d3 again
+	movel	a6@(20),d3	|
+	bclr	IMM (31),d2	| clear sign bit
+1:	cmpl	IMM (0x7ff00000),d2 | check for non-finiteness
+	bge	Ld$inop		| in case NaN or +/-INFINITY return NaN
+	PICLEA	SYM (_fpCCR),a0
+	movew	IMM (0),a0@
+#ifndef __mcoldfire__
+	moveml	sp@+,d2-d7
+#else
+	moveml	sp@,d2-d7
+	| XXX if frame pointer is ever removed, stack pointer must
+	| be adjusted here.
+#endif
+	unlk	a6
+	rts
+
+| If a number is denormalized we put an exponent of 1 but do not put the 
+| hidden bit back into the fraction; instead we shift left until bit 21
+| (the hidden bit) is set, adjusting the exponent accordingly. We do this
+| to ensure that the product of the fractions is close to 1.
+Lmuldf$a$den:
+	movel	IMM (1),d4
+	andl	d6,d0
+1:	addl	d1,d1           | shift a left until bit 20 is set
+	addxl	d0,d0		|
+#ifndef __mcoldfire__
+	subw	IMM (1),d4	| and adjust exponent
+#else
+	subl	IMM (1),d4	| and adjust exponent
+#endif
+	btst	IMM (20),d0	|
+	bne	Lmuldf$1        |
+	bra	1b
+
+Lmuldf$b$den:
+	movel	IMM (1),d5
+	andl	d6,d2
+1:	addl	d3,d3		| shift b left until bit 20 is set
+	addxl	d2,d2		|
+#ifndef __mcoldfire__
+	subw	IMM (1),d5	| and adjust exponent
+#else
+	subql	IMM (1),d5	| and adjust exponent
+#endif
+	btst	IMM (20),d2	|
+	bne	Lmuldf$2	|
+	bra	1b
+
+
+|=============================================================================
+|                              __divdf3
+|=============================================================================
+
+| double __divdf3(double, double);
+	FUNC(__divdf3)
+SYM (__divdf3):
+#ifndef __mcoldfire__
+	link	a6,IMM (0)
+	moveml	d2-d7,sp@-
+#else
+	link	a6,IMM (-24)
+	moveml	d2-d7,sp@
+#endif
+	movel	a6@(8),d0	| get a into d0-d1
+	movel	a6@(12),d1	| 
+	movel	a6@(16),d2	| and b into d2-d3
+	movel	a6@(20),d3	|
+	movel	d0,d7		| d7 will hold the sign of the result
+	eorl	d2,d7		|
+	andl	IMM (0x80000000),d7
+	movel	d7,a0		| save sign into a0
+	movel	IMM (0x7ff00000),d7 | useful constant (+INFINITY)
+	movel	d7,d6		| another (mask for fraction)
+	notl	d6		|
+	bclr	IMM (31),d0	| get rid of a's sign bit '
+	movel	d0,d4		|
+	orl	d1,d4		|
+	beq	Ldivdf$a$0	| branch if a is zero
+	movel	d0,d4		|
+	bclr	IMM (31),d2	| get rid of b's sign bit '
+	movel	d2,d5		|
+	orl	d3,d5		|
+	beq	Ldivdf$b$0	| branch if b is zero
+	movel	d2,d5
+	cmpl	d7,d0		| is a big?
+	bhi	Ldivdf$inop	| if a is NaN return NaN
+	beq	Ldivdf$a$nf	| if d0 == 0x7ff00000 we check d1
+	cmpl	d7,d2		| now compare b with INFINITY 
+	bhi	Ldivdf$inop	| if b is NaN return NaN
+	beq	Ldivdf$b$nf	| if d2 == 0x7ff00000 we check d3
+| Here we have both numbers finite and nonzero (and with no sign bit).
+| Now we get the exponents into d4 and d5 and normalize the numbers to
+| ensure that the ratio of the fractions is around 1. We do this by
+| making sure that both numbers have bit #DBL_MANT_DIG-32-1 (hidden bit)
+| set, even if they were denormalized to start with.
+| Thus, the result will satisfy: 2 > result > 1/2.
+	andl	d7,d4		| and isolate exponent in d4
+	beq	Ldivdf$a$den	| if exponent is zero we have a denormalized
+	andl	d6,d0		| and isolate fraction
+	orl	IMM (0x00100000),d0 | and put hidden bit back
+	swap	d4		| I like exponents in the first byte
+#ifndef __mcoldfire__
+	lsrw	IMM (4),d4	| 
+#else
+	lsrl	IMM (4),d4	| 
+#endif
+Ldivdf$1:			| 
+	andl	d7,d5		|
+	beq	Ldivdf$b$den	|
+	andl	d6,d2		|
+	orl	IMM (0x00100000),d2
+	swap	d5		|
+#ifndef __mcoldfire__
+	lsrw	IMM (4),d5	|
+#else
+	lsrl	IMM (4),d5	|
+#endif
+Ldivdf$2:			|
+#ifndef __mcoldfire__
+	subw	d5,d4		| subtract exponents
+	addw	IMM (D_BIAS),d4	| and add bias
+#else
+	subl	d5,d4		| subtract exponents
+	addl	IMM (D_BIAS),d4	| and add bias
+#endif
+
+| We are now ready to do the division. We have prepared things in such a way
+| that the ratio of the fractions will be less than 2 but greater than 1/2.
+| At this point the registers in use are:
+| d0-d1	hold a (first operand, bit DBL_MANT_DIG-32=0, bit 
+| DBL_MANT_DIG-1-32=1)
+| d2-d3	hold b (second operand, bit DBL_MANT_DIG-32=1)
+| d4	holds the difference of the exponents, corrected by the bias
+| a0	holds the sign of the ratio
+
+| To do the rounding correctly we need to keep information about the
+| nonsignificant bits. One way to do this would be to do the division
+| using four registers; another is to use two registers (as originally
+| I did), but use a sticky bit to preserve information about the 
+| fractional part. Note that we can keep that info in a1, which is not
+| used.
+	movel	IMM (0),d6	| d6-d7 will hold the result
+	movel	d6,d7		| 
+	movel	IMM (0),a1	| and a1 will hold the sticky bit
+
+	movel	IMM (DBL_MANT_DIG-32+1),d5	
+	
+1:	cmpl	d0,d2		| is a < b?
+	bhi	3f		| if b > a skip the following
+	beq	4f		| if d0==d2 check d1 and d3
+2:	subl	d3,d1		| 
+	subxl	d2,d0		| a <-- a - b
+	bset	d5,d6		| set the corresponding bit in d6
+3:	addl	d1,d1		| shift a by 1
+	addxl	d0,d0		|
+#ifndef __mcoldfire__
+	dbra	d5,1b		| and branch back
+#else
+	subql	IMM (1), d5
+	bpl	1b
+#endif
+	bra	5f			
+4:	cmpl	d1,d3		| here d0==d2, so check d1 and d3
+	bhi	3b		| if d1 > d2 skip the subtraction
+	bra	2b		| else go do it
+5:
+| Here we have to start setting the bits in the second long.
+	movel	IMM (31),d5	| again d5 is counter
+
+1:	cmpl	d0,d2		| is a < b?
+	bhi	3f		| if b > a skip the following
+	beq	4f		| if d0==d2 check d1 and d3
+2:	subl	d3,d1		| 
+	subxl	d2,d0		| a <-- a - b
+	bset	d5,d7		| set the corresponding bit in d7
+3:	addl	d1,d1		| shift a by 1
+	addxl	d0,d0		|
+#ifndef __mcoldfire__
+	dbra	d5,1b		| and branch back
+#else
+	subql	IMM (1), d5
+	bpl	1b
+#endif
+	bra	5f			
+4:	cmpl	d1,d3		| here d0==d2, so check d1 and d3
+	bhi	3b		| if d1 > d2 skip the subtraction
+	bra	2b		| else go do it
+5:
+| Now go ahead checking until we hit a one, which we store in d2.
+	movel	IMM (DBL_MANT_DIG),d5
+1:	cmpl	d2,d0		| is a < b?
+	bhi	4f		| if b < a, exit
+	beq	3f		| if d0==d2 check d1 and d3
+2:	addl	d1,d1		| shift a by 1
+	addxl	d0,d0		|
+#ifndef __mcoldfire__
+	dbra	d5,1b		| and branch back
+#else
+	subql	IMM (1), d5
+	bpl	1b
+#endif
+	movel	IMM (0),d2	| here no sticky bit was found
+	movel	d2,d3
+	bra	5f			
+3:	cmpl	d1,d3		| here d0==d2, so check d1 and d3
+	bhi	2b		| if d1 > d2 go back
+4:
+| Here put the sticky bit in d2-d3 (in the position which actually corresponds
+| to it; if you don't do this the algorithm loses in some cases). '
+	movel	IMM (0),d2
+	movel	d2,d3
+#ifndef __mcoldfire__
+	subw	IMM (DBL_MANT_DIG),d5
+	addw	IMM (63),d5
+	cmpw	IMM (31),d5
+#else
+	subl	IMM (DBL_MANT_DIG),d5
+	addl	IMM (63),d5
+	cmpl	IMM (31),d5
+#endif
+	bhi	2f
+1:	bset	d5,d3
+	bra	5f
+#ifndef __mcoldfire__
+	subw	IMM (32),d5
+#else
+	subl	IMM (32),d5
+#endif
+2:	bset	d5,d2
+5:
+| Finally we are finished! Move the longs in the address registers to
+| their final destination:
+	movel	d6,d0
+	movel	d7,d1
+	movel	IMM (0),d3
+
+| Here we have finished the division, with the result in d0-d1-d2-d3, with
+| 2^21 <= d6 < 2^23. Thus bit 23 is not set, but bit 22 could be set.
+| If it is not, then definitely bit 21 is set. Normalize so bit 22 is
+| not set:
+	btst	IMM (DBL_MANT_DIG-32+1),d0
+	beq	1f
+#ifndef __mcoldfire__
+	lsrl	IMM (1),d0
+	roxrl	IMM (1),d1
+	roxrl	IMM (1),d2
+	roxrl	IMM (1),d3
+	addw	IMM (1),d4
+#else
+	lsrl	IMM (1),d3
+	btst	IMM (0),d2
+	beq	10f
+	bset	IMM (31),d3
+10:	lsrl	IMM (1),d2
+	btst	IMM (0),d1
+	beq	11f
+	bset	IMM (31),d2
+11:	lsrl	IMM (1),d1
+	btst	IMM (0),d0
+	beq	12f
+	bset	IMM (31),d1
+12:	lsrl	IMM (1),d0
+	addl	IMM (1),d4
+#endif
+1:
+| Now round, check for over- and underflow, and exit.
+	movel	a0,d7		| restore sign bit to d7
+	moveq	IMM (DIVIDE),d5
+	bra	Lround$exit
+
+Ldivdf$inop:
+	moveq	IMM (DIVIDE),d5
+	bra	Ld$inop
+
+Ldivdf$a$0:
+| If a is zero check to see whether b is zero also. In that case return
+| NaN; then check if b is NaN, and return NaN also in that case. Else
+| return a properly signed zero.
+	moveq	IMM (DIVIDE),d5
+	bclr	IMM (31),d2	|
+	movel	d2,d4		| 
+	orl	d3,d4		| 
+	beq	Ld$inop		| if b is also zero return NaN
+	cmpl	IMM (0x7ff00000),d2 | check for NaN
+	bhi	Ld$inop		| 
+	blt	1f		|
+	tstl	d3		|
+	bne	Ld$inop		|
+1:	movel	a0,d0		| else return signed zero
+	moveq	IMM(0),d1	| 
+	PICLEA	SYM (_fpCCR),a0	| clear exception flags
+	movew	IMM (0),a0@	|
+#ifndef __mcoldfire__
+	moveml	sp@+,d2-d7	| 
+#else
+	moveml	sp@,d2-d7	| 
+	| XXX if frame pointer is ever removed, stack pointer must
+	| be adjusted here.
+#endif
+	unlk	a6		| 
+	rts			| 	
+
+Ldivdf$b$0:
+	moveq	IMM (DIVIDE),d5
+| If we got here a is not zero. Check if a is NaN; in that case return NaN,
+| else return +/-INFINITY. Remember that a is in d0 with the sign bit 
+| cleared already.
+	movel	a0,d7		| put a's sign bit back in d7 '
+	cmpl	IMM (0x7ff00000),d0 | compare d0 with INFINITY
+	bhi	Ld$inop		| if larger it is NaN
+	tstl	d1		| 
+	bne	Ld$inop		| 
+	bra	Ld$div$0	| else signal DIVIDE_BY_ZERO
+
+Ldivdf$b$nf:
+	moveq	IMM (DIVIDE),d5
+| If d2 == 0x7ff00000 we have to check d3.
+	tstl	d3		|
+	bne	Ld$inop		| if d3 <> 0, b is NaN
+	bra	Ld$underflow	| else b is +/-INFINITY, so signal underflow
+
+Ldivdf$a$nf:
+	moveq	IMM (DIVIDE),d5
+| If d0 == 0x7ff00000 we have to check d1.
+	tstl	d1		|
+	bne	Ld$inop		| if d1 <> 0, a is NaN
+| If a is INFINITY we have to check b
+	cmpl	d7,d2		| compare b with INFINITY 
+	bge	Ld$inop		| if b is NaN or INFINITY return NaN
+	tstl	d3		|
+	bne	Ld$inop		| 
+	bra	Ld$overflow	| else return overflow
+
+| If a number is denormalized we put an exponent of 1 but do not put the 
+| bit back into the fraction.
+Ldivdf$a$den:
+	movel	IMM (1),d4
+	andl	d6,d0
+1:	addl	d1,d1		| shift a left until bit 20 is set
+	addxl	d0,d0
+#ifndef __mcoldfire__
+	subw	IMM (1),d4	| and adjust exponent
+#else
+	subl	IMM (1),d4	| and adjust exponent
+#endif
+	btst	IMM (DBL_MANT_DIG-32-1),d0
+	bne	Ldivdf$1
+	bra	1b
+
+Ldivdf$b$den:
+	movel	IMM (1),d5
+	andl	d6,d2
+1:	addl	d3,d3		| shift b left until bit 20 is set
+	addxl	d2,d2
+#ifndef __mcoldfire__
+	subw	IMM (1),d5	| and adjust exponent
+#else
+	subql	IMM (1),d5	| and adjust exponent
+#endif
+	btst	IMM (DBL_MANT_DIG-32-1),d2
+	bne	Ldivdf$2
+	bra	1b
+
+Lround$exit:
+| This is a common exit point for __muldf3 and __divdf3. When they enter
+| this point the sign of the result is in d7, the result in d0-d1, normalized
+| so that 2^21 <= d0 < 2^22, and the exponent is in the lower byte of d4.
+
+| First check for underlow in the exponent:
+#ifndef __mcoldfire__
+	cmpw	IMM (-DBL_MANT_DIG-1),d4		
+#else
+	cmpl	IMM (-DBL_MANT_DIG-1),d4		
+#endif
+	blt	Ld$underflow	
+| It could happen that the exponent is less than 1, in which case the 
+| number is denormalized. In this case we shift right and adjust the 
+| exponent until it becomes 1 or the fraction is zero (in the latter case 
+| we signal underflow and return zero).
+	movel	d7,a0		|
+	movel	IMM (0),d6	| use d6-d7 to collect bits flushed right
+	movel	d6,d7		| use d6-d7 to collect bits flushed right
+#ifndef __mcoldfire__
+	cmpw	IMM (1),d4	| if the exponent is less than 1 we 
+#else
+	cmpl	IMM (1),d4	| if the exponent is less than 1 we 
+#endif
+	bge	2f		| have to shift right (denormalize)
+1:
+#ifndef __mcoldfire__
+	addw	IMM (1),d4	| adjust the exponent
+	lsrl	IMM (1),d0	| shift right once 
+	roxrl	IMM (1),d1	|
+	roxrl	IMM (1),d2	|
+	roxrl	IMM (1),d3	|
+	roxrl	IMM (1),d6	| 
+	roxrl	IMM (1),d7	|
+	cmpw	IMM (1),d4	| is the exponent 1 already?
+#else
+	addl	IMM (1),d4	| adjust the exponent
+	lsrl	IMM (1),d7
+	btst	IMM (0),d6
+	beq	13f
+	bset	IMM (31),d7
+13:	lsrl	IMM (1),d6
+	btst	IMM (0),d3
+	beq	14f
+	bset	IMM (31),d6
+14:	lsrl	IMM (1),d3
+	btst	IMM (0),d2
+	beq	10f
+	bset	IMM (31),d3
+10:	lsrl	IMM (1),d2
+	btst	IMM (0),d1
+	beq	11f
+	bset	IMM (31),d2
+11:	lsrl	IMM (1),d1
+	btst	IMM (0),d0
+	beq	12f
+	bset	IMM (31),d1
+12:	lsrl	IMM (1),d0
+	cmpl	IMM (1),d4	| is the exponent 1 already?
+#endif
+	beq	2f		| if not loop back
+	bra	1b              |
+	bra	Ld$underflow	| safety check, shouldn't execute '
+2:	orl	d6,d2		| this is a trick so we don't lose  '
+	orl	d7,d3		| the bits which were flushed right
+	movel	a0,d7		| get back sign bit into d7
+| Now call the rounding routine (which takes care of denormalized numbers):
+	lea	pc@(Lround$0),a0 | to return from rounding routine
+	PICLEA	SYM (_fpCCR),a1	| check the rounding mode
+#ifdef __mcoldfire__
+	clrl	d6
+#endif
+	movew	a1@(6),d6	| rounding mode in d6
+	beq	Lround$to$nearest
+#ifndef __mcoldfire__
+	cmpw	IMM (ROUND_TO_PLUS),d6
+#else
+	cmpl	IMM (ROUND_TO_PLUS),d6
+#endif
+	bhi	Lround$to$minus
+	blt	Lround$to$zero
+	bra	Lround$to$plus
+Lround$0:
+| Here we have a correctly rounded result (either normalized or denormalized).
+
+| Here we should have either a normalized number or a denormalized one, and
+| the exponent is necessarily larger or equal to 1 (so we don't have to  '
+| check again for underflow!). We have to check for overflow or for a 
+| denormalized number (which also signals underflow).
+| Check for overflow (i.e., exponent >= 0x7ff).
+#ifndef __mcoldfire__
+	cmpw	IMM (0x07ff),d4
+#else
+	cmpl	IMM (0x07ff),d4
+#endif
+	bge	Ld$overflow
+| Now check for a denormalized number (exponent==0):
+	movew	d4,d4
+	beq	Ld$den
+1:
+| Put back the exponents and sign and return.
+#ifndef __mcoldfire__
+	lslw	IMM (4),d4	| exponent back to fourth byte
+#else
+	lsll	IMM (4),d4	| exponent back to fourth byte
+#endif
+	bclr	IMM (DBL_MANT_DIG-32-1),d0
+	swap	d0		| and put back exponent
+#ifndef __mcoldfire__
+	orw	d4,d0		| 
+#else
+	orl	d4,d0		| 
+#endif
+	swap	d0		|
+	orl	d7,d0		| and sign also
+
+	PICLEA	SYM (_fpCCR),a0
+	movew	IMM (0),a0@
+#ifndef __mcoldfire__
+	moveml	sp@+,d2-d7
+#else
+	moveml	sp@,d2-d7
+	| XXX if frame pointer is ever removed, stack pointer must
+	| be adjusted here.
+#endif
+	unlk	a6
+	rts
+
+|=============================================================================
+|                              __negdf2
+|=============================================================================
+
+| double __negdf2(double, double);
+	FUNC(__negdf2)
+SYM (__negdf2):
+#ifndef __mcoldfire__
+	link	a6,IMM (0)
+	moveml	d2-d7,sp@-
+#else
+	link	a6,IMM (-24)
+	moveml	d2-d7,sp@
+#endif
+	moveq	IMM (NEGATE),d5
+	movel	a6@(8),d0	| get number to negate in d0-d1
+	movel	a6@(12),d1	|
+	bchg	IMM (31),d0	| negate
+	movel	d0,d2		| make a positive copy (for the tests)
+	bclr	IMM (31),d2	|
+	movel	d2,d4		| check for zero
+	orl	d1,d4		|
+	beq	2f		| if zero (either sign) return +zero
+	cmpl	IMM (0x7ff00000),d2 | compare to +INFINITY
+	blt	1f		| if finite, return
+	bhi	Ld$inop		| if larger (fraction not zero) is NaN
+	tstl	d1		| if d2 == 0x7ff00000 check d1
+	bne	Ld$inop		|
+	movel	d0,d7		| else get sign and return INFINITY
+	andl	IMM (0x80000000),d7
+	bra	Ld$infty		
+1:	PICLEA	SYM (_fpCCR),a0
+	movew	IMM (0),a0@
+#ifndef __mcoldfire__
+	moveml	sp@+,d2-d7
+#else
+	moveml	sp@,d2-d7
+	| XXX if frame pointer is ever removed, stack pointer must
+	| be adjusted here.
+#endif
+	unlk	a6
+	rts
+2:	bclr	IMM (31),d0
+	bra	1b
+
+|=============================================================================
+|                              __cmpdf2
+|=============================================================================
+
+GREATER =  1
+LESS    = -1
+EQUAL   =  0
+
+| int __cmpdf2_internal(double, double, int);
+SYM (__cmpdf2_internal):
+#ifndef __mcoldfire__
+	link	a6,IMM (0)
+	moveml	d2-d7,sp@- 	| save registers
+#else
+	link	a6,IMM (-24)
+	moveml	d2-d7,sp@
+#endif
+	moveq	IMM (COMPARE),d5
+	movel	a6@(8),d0	| get first operand
+	movel	a6@(12),d1	|
+	movel	a6@(16),d2	| get second operand
+	movel	a6@(20),d3	|
+| First check if a and/or b are (+/-) zero and in that case clear
+| the sign bit.
+	movel	d0,d6		| copy signs into d6 (a) and d7(b)
+	bclr	IMM (31),d0	| and clear signs in d0 and d2
+	movel	d2,d7		|
+	bclr	IMM (31),d2	|
+	cmpl	IMM (0x7ff00000),d0 | check for a == NaN
+	bhi	Lcmpd$inop		| if d0 > 0x7ff00000, a is NaN
+	beq	Lcmpdf$a$nf	| if equal can be INFINITY, so check d1
+	movel	d0,d4		| copy into d4 to test for zero
+	orl	d1,d4		|
+	beq	Lcmpdf$a$0	|
+Lcmpdf$0:
+	cmpl	IMM (0x7ff00000),d2 | check for b == NaN
+	bhi	Lcmpd$inop		| if d2 > 0x7ff00000, b is NaN
+	beq	Lcmpdf$b$nf	| if equal can be INFINITY, so check d3
+	movel	d2,d4		|
+	orl	d3,d4		|
+	beq	Lcmpdf$b$0	|
+Lcmpdf$1:
+| Check the signs
+	eorl	d6,d7
+	bpl	1f
+| If the signs are not equal check if a >= 0
+	tstl	d6
+	bpl	Lcmpdf$a$gt$b	| if (a >= 0 && b < 0) => a > b
+	bmi	Lcmpdf$b$gt$a	| if (a < 0 && b >= 0) => a < b
+1:
+| If the signs are equal check for < 0
+	tstl	d6
+	bpl	1f
+| If both are negative exchange them
+#ifndef __mcoldfire__
+	exg	d0,d2
+	exg	d1,d3
+#else
+	movel	d0,d7
+	movel	d2,d0
+	movel	d7,d2
+	movel	d1,d7
+	movel	d3,d1
+	movel	d7,d3
+#endif
+1:
+| Now that they are positive we just compare them as longs (does this also
+| work for denormalized numbers?).
+	cmpl	d0,d2
+	bhi	Lcmpdf$b$gt$a	| |b| > |a|
+	bne	Lcmpdf$a$gt$b	| |b| < |a|
+| If we got here d0 == d2, so we compare d1 and d3.
+	cmpl	d1,d3
+	bhi	Lcmpdf$b$gt$a	| |b| > |a|
+	bne	Lcmpdf$a$gt$b	| |b| < |a|
+| If we got here a == b.
+	movel	IMM (EQUAL),d0
+#ifndef __mcoldfire__
+	moveml	sp@+,d2-d7 	| put back the registers
+#else
+	moveml	sp@,d2-d7
+	| XXX if frame pointer is ever removed, stack pointer must
+	| be adjusted here.
+#endif
+	unlk	a6
+	rts
+Lcmpdf$a$gt$b:
+	movel	IMM (GREATER),d0
+#ifndef __mcoldfire__
+	moveml	sp@+,d2-d7 	| put back the registers
+#else
+	moveml	sp@,d2-d7
+	| XXX if frame pointer is ever removed, stack pointer must
+	| be adjusted here.
+#endif
+	unlk	a6
+	rts
+Lcmpdf$b$gt$a:
+	movel	IMM (LESS),d0
+#ifndef __mcoldfire__
+	moveml	sp@+,d2-d7 	| put back the registers
+#else
+	moveml	sp@,d2-d7
+	| XXX if frame pointer is ever removed, stack pointer must
+	| be adjusted here.
+#endif
+	unlk	a6
+	rts
+
+Lcmpdf$a$0:	
+	bclr	IMM (31),d6
+	bra	Lcmpdf$0
+Lcmpdf$b$0:
+	bclr	IMM (31),d7
+	bra	Lcmpdf$1
+
+Lcmpdf$a$nf:
+	tstl	d1
+	bne	Ld$inop
+	bra	Lcmpdf$0
+
+Lcmpdf$b$nf:
+	tstl	d3
+	bne	Ld$inop
+	bra	Lcmpdf$1
+
+Lcmpd$inop:
+	movl	a6@(24),d0
+	moveq	IMM (INEXACT_RESULT+INVALID_OPERATION),d7
+	moveq	IMM (DOUBLE_FLOAT),d6
+	PICJUMP	$_exception_handler
+
+| int __cmpdf2(double, double);
+	FUNC(__cmpdf2)
+SYM (__cmpdf2):
+	link	a6,IMM (0)
+	pea	1
+	movl	a6@(20),sp@-
+	movl	a6@(16),sp@-
+	movl	a6@(12),sp@-
+	movl	a6@(8),sp@-
+	PICCALL	SYM (__cmpdf2_internal)
+	unlk	a6
+	rts
+
+|=============================================================================
+|                           rounding routines
+|=============================================================================
+
+| The rounding routines expect the number to be normalized in registers
+| d0-d1-d2-d3, with the exponent in register d4. They assume that the 
+| exponent is larger or equal to 1. They return a properly normalized number
+| if possible, and a denormalized number otherwise. The exponent is returned
+| in d4.
+
+Lround$to$nearest:
+| We now normalize as suggested by D. Knuth ("Seminumerical Algorithms"):
+| Here we assume that the exponent is not too small (this should be checked
+| before entering the rounding routine), but the number could be denormalized.
+
+| Check for denormalized numbers:
+1:	btst	IMM (DBL_MANT_DIG-32),d0
+	bne	2f		| if set the number is normalized
+| Normalize shifting left until bit #DBL_MANT_DIG-32 is set or the exponent 
+| is one (remember that a denormalized number corresponds to an 
+| exponent of -D_BIAS+1).
+#ifndef __mcoldfire__
+	cmpw	IMM (1),d4	| remember that the exponent is at least one
+#else
+	cmpl	IMM (1),d4	| remember that the exponent is at least one
+#endif
+ 	beq	2f		| an exponent of one means denormalized
+	addl	d3,d3		| else shift and adjust the exponent
+	addxl	d2,d2		|
+	addxl	d1,d1		|
+	addxl	d0,d0		|
+#ifndef __mcoldfire__
+	dbra	d4,1b		|
+#else
+	subql	IMM (1), d4
+	bpl	1b
+#endif
+2:
+| Now round: we do it as follows: after the shifting we can write the
+| fraction part as f + delta, where 1 < f < 2^25, and 0 <= delta <= 2.
+| If delta < 1, do nothing. If delta > 1, add 1 to f. 
+| If delta == 1, we make sure the rounded number will be even (odd?) 
+| (after shifting).
+	btst	IMM (0),d1	| is delta < 1?
+	beq	2f		| if so, do not do anything
+	orl	d2,d3		| is delta == 1?
+	bne	1f		| if so round to even
+	movel	d1,d3		| 
+	andl	IMM (2),d3	| bit 1 is the last significant bit
+	movel	IMM (0),d2	|
+	addl	d3,d1		|
+	addxl	d2,d0		|
+	bra	2f		| 
+1:	movel	IMM (1),d3	| else add 1 
+	movel	IMM (0),d2	|
+	addl	d3,d1		|
+	addxl	d2,d0
+| Shift right once (because we used bit #DBL_MANT_DIG-32!).
+2:
+#ifndef __mcoldfire__
+	lsrl	IMM (1),d0
+	roxrl	IMM (1),d1		
+#else
+	lsrl	IMM (1),d1
+	btst	IMM (0),d0
+	beq	10f
+	bset	IMM (31),d1
+10:	lsrl	IMM (1),d0
+#endif
+
+| Now check again bit #DBL_MANT_DIG-32 (rounding could have produced a
+| 'fraction overflow' ...).
+	btst	IMM (DBL_MANT_DIG-32),d0	
+	beq	1f
+#ifndef __mcoldfire__
+	lsrl	IMM (1),d0
+	roxrl	IMM (1),d1
+	addw	IMM (1),d4
+#else
+	lsrl	IMM (1),d1
+	btst	IMM (0),d0
+	beq	10f
+	bset	IMM (31),d1
+10:	lsrl	IMM (1),d0
+	addl	IMM (1),d4
+#endif
+1:
+| If bit #DBL_MANT_DIG-32-1 is clear we have a denormalized number, so we 
+| have to put the exponent to zero and return a denormalized number.
+	btst	IMM (DBL_MANT_DIG-32-1),d0
+	beq	1f
+	jmp	a0@
+1:	movel	IMM (0),d4
+	jmp	a0@
+
+Lround$to$zero:
+Lround$to$plus:
+Lround$to$minus:
+	jmp	a0@
+#endif /* L_double */
+
+#ifdef  L_float
+
+	.globl	SYM (_fpCCR)
+	.globl  $_exception_handler
+
+QUIET_NaN    = 0xffffffff
+SIGNL_NaN    = 0x7f800001
+INFINITY     = 0x7f800000
+
+F_MAX_EXP      = 0xff
+F_BIAS         = 126
+FLT_MAX_EXP    = F_MAX_EXP - F_BIAS
+FLT_MIN_EXP    = 1 - F_BIAS
+FLT_MANT_DIG   = 24
+
+INEXACT_RESULT 		= 0x0001
+UNDERFLOW 		= 0x0002
+OVERFLOW 		= 0x0004
+DIVIDE_BY_ZERO 		= 0x0008
+INVALID_OPERATION 	= 0x0010
+
+SINGLE_FLOAT = 1
+
+NOOP         = 0
+ADD          = 1
+MULTIPLY     = 2
+DIVIDE       = 3
+NEGATE       = 4
+COMPARE      = 5
+EXTENDSFDF   = 6
+TRUNCDFSF    = 7
+
+UNKNOWN           = -1
+ROUND_TO_NEAREST  = 0 | round result to nearest representable value
+ROUND_TO_ZERO     = 1 | round result towards zero
+ROUND_TO_PLUS     = 2 | round result towards plus infinity
+ROUND_TO_MINUS    = 3 | round result towards minus infinity
+
+| Entry points:
+
+	.globl SYM (__addsf3)
+	.globl SYM (__subsf3)
+	.globl SYM (__mulsf3)
+	.globl SYM (__divsf3)
+	.globl SYM (__negsf2)
+	.globl SYM (__cmpsf2)
+	.globl SYM (__cmpsf2_internal)
+	.hidden SYM (__cmpsf2_internal)
+
+| These are common routines to return and signal exceptions.	
+
+	.text
+	.even
+
+Lf$den:
+| Return and signal a denormalized number
+	orl	d7,d0
+	moveq	IMM (INEXACT_RESULT+UNDERFLOW),d7
+	moveq	IMM (SINGLE_FLOAT),d6
+	PICJUMP	$_exception_handler
+
+Lf$infty:
+Lf$overflow:
+| Return a properly signed INFINITY and set the exception flags 
+	movel	IMM (INFINITY),d0
+	orl	d7,d0
+	moveq	IMM (INEXACT_RESULT+OVERFLOW),d7
+	moveq	IMM (SINGLE_FLOAT),d6
+	PICJUMP	$_exception_handler
+
+Lf$underflow:
+| Return 0 and set the exception flags 
+	moveq	IMM (0),d0
+	moveq	IMM (INEXACT_RESULT+UNDERFLOW),d7
+	moveq	IMM (SINGLE_FLOAT),d6
+	PICJUMP	$_exception_handler
+
+Lf$inop:
+| Return a quiet NaN and set the exception flags
+	movel	IMM (QUIET_NaN),d0
+	moveq	IMM (INEXACT_RESULT+INVALID_OPERATION),d7
+	moveq	IMM (SINGLE_FLOAT),d6
+	PICJUMP	$_exception_handler
+
+Lf$div$0:
+| Return a properly signed INFINITY and set the exception flags
+	movel	IMM (INFINITY),d0
+	orl	d7,d0
+	moveq	IMM (INEXACT_RESULT+DIVIDE_BY_ZERO),d7
+	moveq	IMM (SINGLE_FLOAT),d6
+	PICJUMP	$_exception_handler
+
+|=============================================================================
+|=============================================================================
+|                         single precision routines
+|=============================================================================
+|=============================================================================
+
+| A single precision floating point number (float) has the format:
+|
+| struct _float {
+|  unsigned int sign      : 1;  /* sign bit */ 
+|  unsigned int exponent  : 8;  /* exponent, shifted by 126 */
+|  unsigned int fraction  : 23; /* fraction */
+| } float;
+| 
+| Thus sizeof(float) = 4 (32 bits). 
+|
+| All the routines are callable from C programs, and return the result 
+| in the single register d0. They also preserve all registers except 
+| d0-d1 and a0-a1.
+
+|=============================================================================
+|                              __subsf3
+|=============================================================================
+
+| float __subsf3(float, float);
+	FUNC(__subsf3)
+SYM (__subsf3):
+	bchg	IMM (31),sp@(8)	| change sign of second operand
+				| and fall through
+|=============================================================================
+|                              __addsf3
+|=============================================================================
+
+| float __addsf3(float, float);
+	FUNC(__addsf3)
+SYM (__addsf3):
+#ifndef __mcoldfire__
+	link	a6,IMM (0)	| everything will be done in registers
+	moveml	d2-d7,sp@-	| save all data registers but d0-d1
+#else
+	link	a6,IMM (-24)
+	moveml	d2-d7,sp@
+#endif
+	movel	a6@(8),d0	| get first operand
+	movel	a6@(12),d1	| get second operand
+	movel	d0,a0		| get d0's sign bit '
+	addl	d0,d0		| check and clear sign bit of a
+	beq	Laddsf$b	| if zero return second operand
+	movel	d1,a1		| save b's sign bit '
+	addl	d1,d1		| get rid of sign bit
+	beq	Laddsf$a	| if zero return first operand
+
+| Get the exponents and check for denormalized and/or infinity.
+
+	movel	IMM (0x00ffffff),d4	| mask to get fraction
+	movel	IMM (0x01000000),d5	| mask to put hidden bit back
+
+	movel	d0,d6		| save a to get exponent
+	andl	d4,d0		| get fraction in d0
+	notl 	d4		| make d4 into a mask for the exponent
+	andl	d4,d6		| get exponent in d6
+	beq	Laddsf$a$den	| branch if a is denormalized
+	cmpl	d4,d6		| check for INFINITY or NaN
+	beq	Laddsf$nf
+	swap	d6		| put exponent into first word
+	orl	d5,d0		| and put hidden bit back
+Laddsf$1:
+| Now we have a's exponent in d6 (second byte) and the mantissa in d0. '
+	movel	d1,d7		| get exponent in d7
+	andl	d4,d7		| 
+	beq	Laddsf$b$den	| branch if b is denormalized
+	cmpl	d4,d7		| check for INFINITY or NaN
+	beq	Laddsf$nf
+	swap	d7		| put exponent into first word
+	notl 	d4		| make d4 into a mask for the fraction
+	andl	d4,d1		| get fraction in d1
+	orl	d5,d1		| and put hidden bit back
+Laddsf$2:
+| Now we have b's exponent in d7 (second byte) and the mantissa in d1. '
+
+| Note that the hidden bit corresponds to bit #FLT_MANT_DIG-1, and we 
+| shifted right once, so bit #FLT_MANT_DIG is set (so we have one extra
+| bit).
+
+	movel	d1,d2		| move b to d2, since we want to use
+				| two registers to do the sum
+	movel	IMM (0),d1	| and clear the new ones
+	movel	d1,d3		|
+
+| Here we shift the numbers in registers d0 and d1 so the exponents are the
+| same, and put the largest exponent in d6. Note that we are using two
+| registers for each number (see the discussion by D. Knuth in "Seminumerical 
+| Algorithms").
+#ifndef __mcoldfire__
+	cmpw	d6,d7		| compare exponents
+#else
+	cmpl	d6,d7		| compare exponents
+#endif
+	beq	Laddsf$3	| if equal don't shift '
+	bhi	5f		| branch if second exponent largest
+1:
+	subl	d6,d7		| keep the largest exponent
+	negl	d7
+#ifndef __mcoldfire__
+	lsrw	IMM (8),d7	| put difference in lower byte
+#else
+	lsrl	IMM (8),d7	| put difference in lower byte
+#endif
+| if difference is too large we don't shift (actually, we can just exit) '
+#ifndef __mcoldfire__
+	cmpw	IMM (FLT_MANT_DIG+2),d7		
+#else
+	cmpl	IMM (FLT_MANT_DIG+2),d7		
+#endif
+	bge	Laddsf$b$small
+#ifndef __mcoldfire__
+	cmpw	IMM (16),d7	| if difference >= 16 swap
+#else
+	cmpl	IMM (16),d7	| if difference >= 16 swap
+#endif
+	bge	4f
+2:
+#ifndef __mcoldfire__
+	subw	IMM (1),d7
+#else
+	subql	IMM (1), d7
+#endif
+3:
+#ifndef __mcoldfire__
+	lsrl	IMM (1),d2	| shift right second operand
+	roxrl	IMM (1),d3
+	dbra	d7,3b
+#else
+	lsrl	IMM (1),d3
+	btst	IMM (0),d2
+	beq	10f
+	bset	IMM (31),d3
+10:	lsrl	IMM (1),d2
+	subql	IMM (1), d7
+	bpl	3b
+#endif
+	bra	Laddsf$3
+4:
+	movew	d2,d3
+	swap	d3
+	movew	d3,d2
+	swap	d2
+#ifndef __mcoldfire__
+	subw	IMM (16),d7
+#else
+	subl	IMM (16),d7
+#endif
+	bne	2b		| if still more bits, go back to normal case
+	bra	Laddsf$3
+5:
+#ifndef __mcoldfire__
+	exg	d6,d7		| exchange the exponents
+#else
+	eorl	d6,d7
+	eorl	d7,d6
+	eorl	d6,d7
+#endif
+	subl	d6,d7		| keep the largest exponent
+	negl	d7		|
+#ifndef __mcoldfire__
+	lsrw	IMM (8),d7	| put difference in lower byte
+#else
+	lsrl	IMM (8),d7	| put difference in lower byte
+#endif
+| if difference is too large we don't shift (and exit!) '
+#ifndef __mcoldfire__
+	cmpw	IMM (FLT_MANT_DIG+2),d7		
+#else
+	cmpl	IMM (FLT_MANT_DIG+2),d7		
+#endif
+	bge	Laddsf$a$small
+#ifndef __mcoldfire__
+	cmpw	IMM (16),d7	| if difference >= 16 swap
+#else
+	cmpl	IMM (16),d7	| if difference >= 16 swap
+#endif
+	bge	8f
+6:
+#ifndef __mcoldfire__
+	subw	IMM (1),d7
+#else
+	subl	IMM (1),d7
+#endif
+7:
+#ifndef __mcoldfire__
+	lsrl	IMM (1),d0	| shift right first operand
+	roxrl	IMM (1),d1
+	dbra	d7,7b
+#else
+	lsrl	IMM (1),d1
+	btst	IMM (0),d0
+	beq	10f
+	bset	IMM (31),d1
+10:	lsrl	IMM (1),d0
+	subql	IMM (1),d7
+	bpl	7b
+#endif
+	bra	Laddsf$3
+8:
+	movew	d0,d1
+	swap	d1
+	movew	d1,d0
+	swap	d0
+#ifndef __mcoldfire__
+	subw	IMM (16),d7
+#else
+	subl	IMM (16),d7
+#endif
+	bne	6b		| if still more bits, go back to normal case
+				| otherwise we fall through
+
+| Now we have a in d0-d1, b in d2-d3, and the largest exponent in d6 (the
+| signs are stored in a0 and a1).
+
+Laddsf$3:
+| Here we have to decide whether to add or subtract the numbers
+#ifndef __mcoldfire__
+	exg	d6,a0		| get signs back
+	exg	d7,a1		| and save the exponents
+#else
+	movel	d6,d4
+	movel	a0,d6
+	movel	d4,a0
+	movel	d7,d4
+	movel	a1,d7
+	movel	d4,a1
+#endif
+	eorl	d6,d7		| combine sign bits
+	bmi	Lsubsf$0	| if negative a and b have opposite 
+				| sign so we actually subtract the
+				| numbers
+
+| Here we have both positive or both negative
+#ifndef __mcoldfire__
+	exg	d6,a0		| now we have the exponent in d6
+#else
+	movel	d6,d4
+	movel	a0,d6
+	movel	d4,a0
+#endif
+	movel	a0,d7		| and sign in d7
+	andl	IMM (0x80000000),d7
+| Here we do the addition.
+	addl	d3,d1
+	addxl	d2,d0
+| Note: now we have d2, d3, d4 and d5 to play with! 
+
+| Put the exponent, in the first byte, in d2, to use the "standard" rounding
+| routines:
+	movel	d6,d2
+#ifndef __mcoldfire__
+	lsrw	IMM (8),d2
+#else
+	lsrl	IMM (8),d2
+#endif
+
+| Before rounding normalize so bit #FLT_MANT_DIG is set (we will consider
+| the case of denormalized numbers in the rounding routine itself).
+| As in the addition (not in the subtraction!) we could have set 
+| one more bit we check this:
+	btst	IMM (FLT_MANT_DIG+1),d0	
+	beq	1f
+#ifndef __mcoldfire__
+	lsrl	IMM (1),d0
+	roxrl	IMM (1),d1
+#else
+	lsrl	IMM (1),d1
+	btst	IMM (0),d0
+	beq	10f
+	bset	IMM (31),d1
+10:	lsrl	IMM (1),d0
+#endif
+	addl	IMM (1),d2
+1:
+	lea	pc@(Laddsf$4),a0 | to return from rounding routine
+	PICLEA	SYM (_fpCCR),a1	| check the rounding mode
+#ifdef __mcoldfire__
+	clrl	d6
+#endif
+	movew	a1@(6),d6	| rounding mode in d6
+	beq	Lround$to$nearest
+#ifndef __mcoldfire__
+	cmpw	IMM (ROUND_TO_PLUS),d6
+#else
+	cmpl	IMM (ROUND_TO_PLUS),d6
+#endif
+	bhi	Lround$to$minus
+	blt	Lround$to$zero
+	bra	Lround$to$plus
+Laddsf$4:
+| Put back the exponent, but check for overflow.
+#ifndef __mcoldfire__
+	cmpw	IMM (0xff),d2
+#else
+	cmpl	IMM (0xff),d2
+#endif
+	bhi	1f
+	bclr	IMM (FLT_MANT_DIG-1),d0
+#ifndef __mcoldfire__
+	lslw	IMM (7),d2
+#else
+	lsll	IMM (7),d2
+#endif
+	swap	d2
+	orl	d2,d0
+	bra	Laddsf$ret
+1:
+	moveq	IMM (ADD),d5
+	bra	Lf$overflow
+
+Lsubsf$0:
+| We are here if a > 0 and b < 0 (sign bits cleared).
+| Here we do the subtraction.
+	movel	d6,d7		| put sign in d7
+	andl	IMM (0x80000000),d7
+
+	subl	d3,d1		| result in d0-d1
+	subxl	d2,d0		|
+	beq	Laddsf$ret	| if zero just exit
+	bpl	1f		| if positive skip the following
+	bchg	IMM (31),d7	| change sign bit in d7
+	negl	d1
+	negxl	d0
+1:
+#ifndef __mcoldfire__
+	exg	d2,a0		| now we have the exponent in d2
+	lsrw	IMM (8),d2	| put it in the first byte
+#else
+	movel	d2,d4
+	movel	a0,d2
+	movel	d4,a0
+	lsrl	IMM (8),d2	| put it in the first byte
+#endif
+
+| Now d0-d1 is positive and the sign bit is in d7.
+
+| Note that we do not have to normalize, since in the subtraction bit
+| #FLT_MANT_DIG+1 is never set, and denormalized numbers are handled by
+| the rounding routines themselves.
+	lea	pc@(Lsubsf$1),a0 | to return from rounding routine
+	PICLEA	SYM (_fpCCR),a1	| check the rounding mode
+#ifdef __mcoldfire__
+	clrl	d6
+#endif
+	movew	a1@(6),d6	| rounding mode in d6
+	beq	Lround$to$nearest
+#ifndef __mcoldfire__
+	cmpw	IMM (ROUND_TO_PLUS),d6
+#else
+	cmpl	IMM (ROUND_TO_PLUS),d6
+#endif
+	bhi	Lround$to$minus
+	blt	Lround$to$zero
+	bra	Lround$to$plus
+Lsubsf$1:
+| Put back the exponent (we can't have overflow!). '
+	bclr	IMM (FLT_MANT_DIG-1),d0
+#ifndef __mcoldfire__
+	lslw	IMM (7),d2
+#else
+	lsll	IMM (7),d2
+#endif
+	swap	d2
+	orl	d2,d0
+	bra	Laddsf$ret
+
+| If one of the numbers was too small (difference of exponents >= 
+| FLT_MANT_DIG+2) we return the other (and now we don't have to '
+| check for finiteness or zero).
+Laddsf$a$small:
+	movel	a6@(12),d0
+	PICLEA	SYM (_fpCCR),a0
+	movew	IMM (0),a0@
+#ifndef __mcoldfire__
+	moveml	sp@+,d2-d7	| restore data registers
+#else
+	moveml	sp@,d2-d7
+	| XXX if frame pointer is ever removed, stack pointer must
+	| be adjusted here.
+#endif
+	unlk	a6		| and return
+	rts
+
+Laddsf$b$small:
+	movel	a6@(8),d0
+	PICLEA	SYM (_fpCCR),a0
+	movew	IMM (0),a0@
+#ifndef __mcoldfire__
+	moveml	sp@+,d2-d7	| restore data registers
+#else
+	moveml	sp@,d2-d7
+	| XXX if frame pointer is ever removed, stack pointer must
+	| be adjusted here.
+#endif
+	unlk	a6		| and return
+	rts
+
+| If the numbers are denormalized remember to put exponent equal to 1.
+
+Laddsf$a$den:
+	movel	d5,d6		| d5 contains 0x01000000
+	swap	d6
+	bra	Laddsf$1
+
+Laddsf$b$den:
+	movel	d5,d7
+	swap	d7
+	notl 	d4		| make d4 into a mask for the fraction
+				| (this was not executed after the jump)
+	bra	Laddsf$2
+
+| The rest is mainly code for the different results which can be 
+| returned (checking always for +/-INFINITY and NaN).
+
+Laddsf$b:
+| Return b (if a is zero).
+	movel	a6@(12),d0
+	cmpl	IMM (0x80000000),d0	| Check if b is -0
+	bne	1f
+	movel	a0,d7
+	andl	IMM (0x80000000),d7	| Use the sign of a
+	clrl	d0
+	bra	Laddsf$ret
+Laddsf$a:
+| Return a (if b is zero).
+	movel	a6@(8),d0
+1:
+	moveq	IMM (ADD),d5
+| We have to check for NaN and +/-infty.
+	movel	d0,d7
+	andl	IMM (0x80000000),d7	| put sign in d7
+	bclr	IMM (31),d0		| clear sign
+	cmpl	IMM (INFINITY),d0	| check for infty or NaN
+	bge	2f
+	movel	d0,d0		| check for zero (we do this because we don't '
+	bne	Laddsf$ret	| want to return -0 by mistake
+	bclr	IMM (31),d7	| if zero be sure to clear sign
+	bra	Laddsf$ret	| if everything OK just return
+2:
+| The value to be returned is either +/-infty or NaN
+	andl	IMM (0x007fffff),d0	| check for NaN
+	bne	Lf$inop			| if mantissa not zero is NaN
+	bra	Lf$infty
+
+Laddsf$ret:
+| Normal exit (a and b nonzero, result is not NaN nor +/-infty).
+| We have to clear the exception flags (just the exception type).
+	PICLEA	SYM (_fpCCR),a0
+	movew	IMM (0),a0@
+	orl	d7,d0		| put sign bit
+#ifndef __mcoldfire__
+	moveml	sp@+,d2-d7	| restore data registers
+#else
+	moveml	sp@,d2-d7
+	| XXX if frame pointer is ever removed, stack pointer must
+	| be adjusted here.
+#endif
+	unlk	a6		| and return
+	rts
+
+Laddsf$ret$den:
+| Return a denormalized number (for addition we don't signal underflow) '
+	lsrl	IMM (1),d0	| remember to shift right back once
+	bra	Laddsf$ret	| and return
+
+| Note: when adding two floats of the same sign if either one is 
+| NaN we return NaN without regard to whether the other is finite or 
+| not. When subtracting them (i.e., when adding two numbers of 
+| opposite signs) things are more complicated: if both are INFINITY 
+| we return NaN, if only one is INFINITY and the other is NaN we return
+| NaN, but if it is finite we return INFINITY with the corresponding sign.
+
+Laddsf$nf:
+	moveq	IMM (ADD),d5
+| This could be faster but it is not worth the effort, since it is not
+| executed very often. We sacrifice speed for clarity here.
+	movel	a6@(8),d0	| get the numbers back (remember that we
+	movel	a6@(12),d1	| did some processing already)
+	movel	IMM (INFINITY),d4 | useful constant (INFINITY)
+	movel	d0,d2		| save sign bits
+	movel	d1,d3
+	bclr	IMM (31),d0	| clear sign bits
+	bclr	IMM (31),d1
+| We know that one of them is either NaN of +/-INFINITY
+| Check for NaN (if either one is NaN return NaN)
+	cmpl	d4,d0		| check first a (d0)
+	bhi	Lf$inop		
+	cmpl	d4,d1		| check now b (d1)
+	bhi	Lf$inop		
+| Now comes the check for +/-INFINITY. We know that both are (maybe not
+| finite) numbers, but we have to check if both are infinite whether we
+| are adding or subtracting them.
+	eorl	d3,d2		| to check sign bits
+	bmi	1f
+	movel	d0,d7
+	andl	IMM (0x80000000),d7	| get (common) sign bit
+	bra	Lf$infty
+1:
+| We know one (or both) are infinite, so we test for equality between the
+| two numbers (if they are equal they have to be infinite both, so we
+| return NaN).
+	cmpl	d1,d0		| are both infinite?
+	beq	Lf$inop		| if so return NaN
+
+	movel	d0,d7
+	andl	IMM (0x80000000),d7 | get a's sign bit '
+	cmpl	d4,d0		| test now for infinity
+	beq	Lf$infty	| if a is INFINITY return with this sign
+	bchg	IMM (31),d7	| else we know b is INFINITY and has
+	bra	Lf$infty	| the opposite sign
+
+|=============================================================================
+|                             __mulsf3
+|=============================================================================
+
+| float __mulsf3(float, float);
+	FUNC(__mulsf3)
+SYM (__mulsf3):
+#ifndef __mcoldfire__
+	link	a6,IMM (0)
+	moveml	d2-d7,sp@-
+#else
+	link	a6,IMM (-24)
+	moveml	d2-d7,sp@
+#endif
+	movel	a6@(8),d0	| get a into d0
+	movel	a6@(12),d1	| and b into d1
+	movel	d0,d7		| d7 will hold the sign of the product
+	eorl	d1,d7		|
+	andl	IMM (0x80000000),d7
+	movel	IMM (INFINITY),d6	| useful constant (+INFINITY)
+	movel	d6,d5			| another (mask for fraction)
+	notl	d5			|
+	movel	IMM (0x00800000),d4	| this is to put hidden bit back
+	bclr	IMM (31),d0		| get rid of a's sign bit '
+	movel	d0,d2			|
+	beq	Lmulsf$a$0		| branch if a is zero
+	bclr	IMM (31),d1		| get rid of b's sign bit '
+	movel	d1,d3		|
+	beq	Lmulsf$b$0	| branch if b is zero
+	cmpl	d6,d0		| is a big?
+	bhi	Lmulsf$inop	| if a is NaN return NaN
+	beq	Lmulsf$inf	| if a is INFINITY we have to check b
+	cmpl	d6,d1		| now compare b with INFINITY
+	bhi	Lmulsf$inop	| is b NaN?
+	beq	Lmulsf$overflow | is b INFINITY?
+| Here we have both numbers finite and nonzero (and with no sign bit).
+| Now we get the exponents into d2 and d3.
+	andl	d6,d2		| and isolate exponent in d2
+	beq	Lmulsf$a$den	| if exponent is zero we have a denormalized
+	andl	d5,d0		| and isolate fraction
+	orl	d4,d0		| and put hidden bit back
+	swap	d2		| I like exponents in the first byte
+#ifndef __mcoldfire__
+	lsrw	IMM (7),d2	| 
+#else
+	lsrl	IMM (7),d2	| 
+#endif
+Lmulsf$1:			| number
+	andl	d6,d3		|
+	beq	Lmulsf$b$den	|
+	andl	d5,d1		|
+	orl	d4,d1		|
+	swap	d3		|
+#ifndef __mcoldfire__
+	lsrw	IMM (7),d3	|
+#else
+	lsrl	IMM (7),d3	|
+#endif
+Lmulsf$2:			|
+#ifndef __mcoldfire__
+	addw	d3,d2		| add exponents
+	subw	IMM (F_BIAS+1),d2 | and subtract bias (plus one)
+#else
+	addl	d3,d2		| add exponents
+	subl	IMM (F_BIAS+1),d2 | and subtract bias (plus one)
+#endif
+
+| We are now ready to do the multiplication. The situation is as follows:
+| both a and b have bit FLT_MANT_DIG-1 set (even if they were 
+| denormalized to start with!), which means that in the product 
+| bit 2*(FLT_MANT_DIG-1) (that is, bit 2*FLT_MANT_DIG-2-32 of the 
+| high long) is set. 
+
+| To do the multiplication let us move the number a little bit around ...
+	movel	d1,d6		| second operand in d6
+	movel	d0,d5		| first operand in d4-d5
+	movel	IMM (0),d4
+	movel	d4,d1		| the sums will go in d0-d1
+	movel	d4,d0
+
+| now bit FLT_MANT_DIG-1 becomes bit 31:
+	lsll	IMM (31-FLT_MANT_DIG+1),d6		
+
+| Start the loop (we loop #FLT_MANT_DIG times):
+	moveq	IMM (FLT_MANT_DIG-1),d3	
+1:	addl	d1,d1		| shift sum 
+	addxl	d0,d0
+	lsll	IMM (1),d6	| get bit bn
+	bcc	2f		| if not set skip sum
+	addl	d5,d1		| add a
+	addxl	d4,d0
+2:
+#ifndef __mcoldfire__
+	dbf	d3,1b		| loop back
+#else
+	subql	IMM (1),d3
+	bpl	1b
+#endif
+
+| Now we have the product in d0-d1, with bit (FLT_MANT_DIG - 1) + FLT_MANT_DIG
+| (mod 32) of d0 set. The first thing to do now is to normalize it so bit 
+| FLT_MANT_DIG is set (to do the rounding).
+#ifndef __mcoldfire__
+	rorl	IMM (6),d1
+	swap	d1
+	movew	d1,d3
+	andw	IMM (0x03ff),d3
+	andw	IMM (0xfd00),d1
+#else
+	movel	d1,d3
+	lsll	IMM (8),d1
+	addl	d1,d1
+	addl	d1,d1
+	moveq	IMM (22),d5
+	lsrl	d5,d3
+	orl	d3,d1
+	andl	IMM (0xfffffd00),d1
+#endif
+	lsll	IMM (8),d0
+	addl	d0,d0
+	addl	d0,d0
+#ifndef __mcoldfire__
+	orw	d3,d0
+#else
+	orl	d3,d0
+#endif
+
+	moveq	IMM (MULTIPLY),d5
+	
+	btst	IMM (FLT_MANT_DIG+1),d0
+	beq	Lround$exit
+#ifndef __mcoldfire__
+	lsrl	IMM (1),d0
+	roxrl	IMM (1),d1
+	addw	IMM (1),d2
+#else
+	lsrl	IMM (1),d1
+	btst	IMM (0),d0
+	beq	10f
+	bset	IMM (31),d1
+10:	lsrl	IMM (1),d0
+	addql	IMM (1),d2
+#endif
+	bra	Lround$exit
+
+Lmulsf$inop:
+	moveq	IMM (MULTIPLY),d5
+	bra	Lf$inop
+
+Lmulsf$overflow:
+	moveq	IMM (MULTIPLY),d5
+	bra	Lf$overflow
+
+Lmulsf$inf:
+	moveq	IMM (MULTIPLY),d5
+| If either is NaN return NaN; else both are (maybe infinite) numbers, so
+| return INFINITY with the correct sign (which is in d7).
+	cmpl	d6,d1		| is b NaN?
+	bhi	Lf$inop		| if so return NaN
+	bra	Lf$overflow	| else return +/-INFINITY
+
+| If either number is zero return zero, unless the other is +/-INFINITY, 
+| or NaN, in which case we return NaN.
+Lmulsf$b$0:
+| Here d1 (==b) is zero.
+	movel	a6@(8),d1	| get a again to check for non-finiteness
+	bra	1f
+Lmulsf$a$0:
+	movel	a6@(12),d1	| get b again to check for non-finiteness
+1:	bclr	IMM (31),d1	| clear sign bit 
+	cmpl	IMM (INFINITY),d1 | and check for a large exponent
+	bge	Lf$inop		| if b is +/-INFINITY or NaN return NaN
+	movel	d7,d0		| else return signed zero
+	PICLEA	SYM (_fpCCR),a0	|
+	movew	IMM (0),a0@	| 
+#ifndef __mcoldfire__
+	moveml	sp@+,d2-d7	| 
+#else
+	moveml	sp@,d2-d7
+	| XXX if frame pointer is ever removed, stack pointer must
+	| be adjusted here.
+#endif
+	unlk	a6		| 
+	rts			| 
+
+| If a number is denormalized we put an exponent of 1 but do not put the 
+| hidden bit back into the fraction; instead we shift left until bit 23
+| (the hidden bit) is set, adjusting the exponent accordingly. We do this
+| to ensure that the product of the fractions is close to 1.
+Lmulsf$a$den:
+	movel	IMM (1),d2
+	andl	d5,d0
+1:	addl	d0,d0		| shift a left (until bit 23 is set)
+#ifndef __mcoldfire__
+	subw	IMM (1),d2	| and adjust exponent
+#else
+	subql	IMM (1),d2	| and adjust exponent
+#endif
+	btst	IMM (FLT_MANT_DIG-1),d0
+	bne	Lmulsf$1	|
+	bra	1b		| else loop back
+
+Lmulsf$b$den:
+	movel	IMM (1),d3
+	andl	d5,d1
+1:	addl	d1,d1		| shift b left until bit 23 is set
+#ifndef __mcoldfire__
+	subw	IMM (1),d3	| and adjust exponent
+#else
+	subql	IMM (1),d3	| and adjust exponent
+#endif
+	btst	IMM (FLT_MANT_DIG-1),d1
+	bne	Lmulsf$2	|
+	bra	1b		| else loop back
+
+|=============================================================================
+|                             __divsf3
+|=============================================================================
+
+| float __divsf3(float, float);
+	FUNC(__divsf3)
+SYM (__divsf3):
+#ifndef __mcoldfire__
+	link	a6,IMM (0)
+	moveml	d2-d7,sp@-
+#else
+	link	a6,IMM (-24)
+	moveml	d2-d7,sp@
+#endif
+	movel	a6@(8),d0		| get a into d0
+	movel	a6@(12),d1		| and b into d1
+	movel	d0,d7			| d7 will hold the sign of the result
+	eorl	d1,d7			|
+	andl	IMM (0x80000000),d7	| 
+	movel	IMM (INFINITY),d6	| useful constant (+INFINITY)
+	movel	d6,d5			| another (mask for fraction)
+	notl	d5			|
+	movel	IMM (0x00800000),d4	| this is to put hidden bit back
+	bclr	IMM (31),d0		| get rid of a's sign bit '
+	movel	d0,d2			|
+	beq	Ldivsf$a$0		| branch if a is zero
+	bclr	IMM (31),d1		| get rid of b's sign bit '
+	movel	d1,d3			|
+	beq	Ldivsf$b$0		| branch if b is zero
+	cmpl	d6,d0			| is a big?
+	bhi	Ldivsf$inop		| if a is NaN return NaN
+	beq	Ldivsf$inf		| if a is INFINITY we have to check b
+	cmpl	d6,d1			| now compare b with INFINITY 
+	bhi	Ldivsf$inop		| if b is NaN return NaN
+	beq	Ldivsf$underflow
+| Here we have both numbers finite and nonzero (and with no sign bit).
+| Now we get the exponents into d2 and d3 and normalize the numbers to
+| ensure that the ratio of the fractions is close to 1. We do this by
+| making sure that bit #FLT_MANT_DIG-1 (hidden bit) is set.
+	andl	d6,d2		| and isolate exponent in d2
+	beq	Ldivsf$a$den	| if exponent is zero we have a denormalized
+	andl	d5,d0		| and isolate fraction
+	orl	d4,d0		| and put hidden bit back
+	swap	d2		| I like exponents in the first byte
+#ifndef __mcoldfire__
+	lsrw	IMM (7),d2	| 
+#else
+	lsrl	IMM (7),d2	| 
+#endif
+Ldivsf$1:			| 
+	andl	d6,d3		|
+	beq	Ldivsf$b$den	|
+	andl	d5,d1		|
+	orl	d4,d1		|
+	swap	d3		|
+#ifndef __mcoldfire__
+	lsrw	IMM (7),d3	|
+#else
+	lsrl	IMM (7),d3	|
+#endif
+Ldivsf$2:			|
+#ifndef __mcoldfire__
+	subw	d3,d2		| subtract exponents
+ 	addw	IMM (F_BIAS),d2	| and add bias
+#else
+	subl	d3,d2		| subtract exponents
+ 	addl	IMM (F_BIAS),d2	| and add bias
+#endif
+ 
+| We are now ready to do the division. We have prepared things in such a way
+| that the ratio of the fractions will be less than 2 but greater than 1/2.
+| At this point the registers in use are:
+| d0	holds a (first operand, bit FLT_MANT_DIG=0, bit FLT_MANT_DIG-1=1)
+| d1	holds b (second operand, bit FLT_MANT_DIG=1)
+| d2	holds the difference of the exponents, corrected by the bias
+| d7	holds the sign of the ratio
+| d4, d5, d6 hold some constants
+	movel	d7,a0		| d6-d7 will hold the ratio of the fractions
+	movel	IMM (0),d6	| 
+	movel	d6,d7
+
+	moveq	IMM (FLT_MANT_DIG+1),d3
+1:	cmpl	d0,d1		| is a < b?
+	bhi	2f		|
+	bset	d3,d6		| set a bit in d6
+	subl	d1,d0		| if a >= b  a <-- a-b
+	beq	3f		| if a is zero, exit
+2:	addl	d0,d0		| multiply a by 2
+#ifndef __mcoldfire__
+	dbra	d3,1b
+#else
+	subql	IMM (1),d3
+	bpl	1b
+#endif
+
+| Now we keep going to set the sticky bit ...
+	moveq	IMM (FLT_MANT_DIG),d3
+1:	cmpl	d0,d1
+	ble	2f
+	addl	d0,d0
+#ifndef __mcoldfire__
+	dbra	d3,1b
+#else
+	subql	IMM(1),d3
+	bpl	1b
+#endif
+	movel	IMM (0),d1
+	bra	3f
+2:	movel	IMM (0),d1
+#ifndef __mcoldfire__
+	subw	IMM (FLT_MANT_DIG),d3
+	addw	IMM (31),d3
+#else
+	subl	IMM (FLT_MANT_DIG),d3
+	addl	IMM (31),d3
+#endif
+	bset	d3,d1
+3:
+	movel	d6,d0		| put the ratio in d0-d1
+	movel	a0,d7		| get sign back
+
+| Because of the normalization we did before we are guaranteed that 
+| d0 is smaller than 2^26 but larger than 2^24. Thus bit 26 is not set,
+| bit 25 could be set, and if it is not set then bit 24 is necessarily set.
+	btst	IMM (FLT_MANT_DIG+1),d0		
+	beq	1f              | if it is not set, then bit 24 is set
+	lsrl	IMM (1),d0	|
+#ifndef __mcoldfire__
+	addw	IMM (1),d2	|
+#else
+	addl	IMM (1),d2	|
+#endif
+1:
+| Now round, check for over- and underflow, and exit.
+	moveq	IMM (DIVIDE),d5
+	bra	Lround$exit
+
+Ldivsf$inop:
+	moveq	IMM (DIVIDE),d5
+	bra	Lf$inop
+
+Ldivsf$overflow:
+	moveq	IMM (DIVIDE),d5
+	bra	Lf$overflow
+
+Ldivsf$underflow:
+	moveq	IMM (DIVIDE),d5
+	bra	Lf$underflow
+
+Ldivsf$a$0:
+	moveq	IMM (DIVIDE),d5
+| If a is zero check to see whether b is zero also. In that case return
+| NaN; then check if b is NaN, and return NaN also in that case. Else
+| return a properly signed zero.
+	andl	IMM (0x7fffffff),d1	| clear sign bit and test b
+	beq	Lf$inop			| if b is also zero return NaN
+	cmpl	IMM (INFINITY),d1	| check for NaN
+	bhi	Lf$inop			| 
+	movel	d7,d0			| else return signed zero
+	PICLEA	SYM (_fpCCR),a0		|
+	movew	IMM (0),a0@		|
+#ifndef __mcoldfire__
+	moveml	sp@+,d2-d7		| 
+#else
+	moveml	sp@,d2-d7		| 
+	| XXX if frame pointer is ever removed, stack pointer must
+	| be adjusted here.
+#endif
+	unlk	a6			| 
+	rts				| 
+	
+Ldivsf$b$0:
+	moveq	IMM (DIVIDE),d5
+| If we got here a is not zero. Check if a is NaN; in that case return NaN,
+| else return +/-INFINITY. Remember that a is in d0 with the sign bit 
+| cleared already.
+	cmpl	IMM (INFINITY),d0	| compare d0 with INFINITY
+	bhi	Lf$inop			| if larger it is NaN
+	bra	Lf$div$0		| else signal DIVIDE_BY_ZERO
+
+Ldivsf$inf:
+	moveq	IMM (DIVIDE),d5
+| If a is INFINITY we have to check b
+	cmpl	IMM (INFINITY),d1	| compare b with INFINITY 
+	bge	Lf$inop			| if b is NaN or INFINITY return NaN
+	bra	Lf$overflow		| else return overflow
+
+| If a number is denormalized we put an exponent of 1 but do not put the 
+| bit back into the fraction.
+Ldivsf$a$den:
+	movel	IMM (1),d2
+	andl	d5,d0
+1:	addl	d0,d0		| shift a left until bit FLT_MANT_DIG-1 is set
+#ifndef __mcoldfire__
+	subw	IMM (1),d2	| and adjust exponent
+#else
+	subl	IMM (1),d2	| and adjust exponent
+#endif
+	btst	IMM (FLT_MANT_DIG-1),d0
+	bne	Ldivsf$1
+	bra	1b
+
+Ldivsf$b$den:
+	movel	IMM (1),d3
+	andl	d5,d1
+1:	addl	d1,d1		| shift b left until bit FLT_MANT_DIG is set
+#ifndef __mcoldfire__
+	subw	IMM (1),d3	| and adjust exponent
+#else
+	subl	IMM (1),d3	| and adjust exponent
+#endif
+	btst	IMM (FLT_MANT_DIG-1),d1
+	bne	Ldivsf$2
+	bra	1b
+
+Lround$exit:
+| This is a common exit point for __mulsf3 and __divsf3. 
+
+| First check for underlow in the exponent:
+#ifndef __mcoldfire__
+	cmpw	IMM (-FLT_MANT_DIG-1),d2		
+#else
+	cmpl	IMM (-FLT_MANT_DIG-1),d2		
+#endif
+	blt	Lf$underflow	
+| It could happen that the exponent is less than 1, in which case the 
+| number is denormalized. In this case we shift right and adjust the 
+| exponent until it becomes 1 or the fraction is zero (in the latter case 
+| we signal underflow and return zero).
+	movel	IMM (0),d6	| d6 is used temporarily
+#ifndef __mcoldfire__
+	cmpw	IMM (1),d2	| if the exponent is less than 1 we 
+#else
+	cmpl	IMM (1),d2	| if the exponent is less than 1 we 
+#endif
+	bge	2f		| have to shift right (denormalize)
+1:
+#ifndef __mcoldfire__
+	addw	IMM (1),d2	| adjust the exponent
+	lsrl	IMM (1),d0	| shift right once 
+	roxrl	IMM (1),d1	|
+	roxrl	IMM (1),d6	| d6 collect bits we would lose otherwise
+	cmpw	IMM (1),d2	| is the exponent 1 already?
+#else
+	addql	IMM (1),d2	| adjust the exponent
+	lsrl	IMM (1),d6
+	btst	IMM (0),d1
+	beq	11f
+	bset	IMM (31),d6
+11:	lsrl	IMM (1),d1
+	btst	IMM (0),d0
+	beq	10f
+	bset	IMM (31),d1
+10:	lsrl	IMM (1),d0
+	cmpl	IMM (1),d2	| is the exponent 1 already?
+#endif
+	beq	2f		| if not loop back
+	bra	1b              |
+	bra	Lf$underflow	| safety check, shouldn't execute '
+2:	orl	d6,d1		| this is a trick so we don't lose  '
+				| the extra bits which were flushed right
+| Now call the rounding routine (which takes care of denormalized numbers):
+	lea	pc@(Lround$0),a0 | to return from rounding routine
+	PICLEA	SYM (_fpCCR),a1	| check the rounding mode
+#ifdef __mcoldfire__
+	clrl	d6
+#endif
+	movew	a1@(6),d6	| rounding mode in d6
+	beq	Lround$to$nearest
+#ifndef __mcoldfire__
+	cmpw	IMM (ROUND_TO_PLUS),d6
+#else
+	cmpl	IMM (ROUND_TO_PLUS),d6
+#endif
+	bhi	Lround$to$minus
+	blt	Lround$to$zero
+	bra	Lround$to$plus
+Lround$0:
+| Here we have a correctly rounded result (either normalized or denormalized).
+
+| Here we should have either a normalized number or a denormalized one, and
+| the exponent is necessarily larger or equal to 1 (so we don't have to  '
+| check again for underflow!). We have to check for overflow or for a 
+| denormalized number (which also signals underflow).
+| Check for overflow (i.e., exponent >= 255).
+#ifndef __mcoldfire__
+	cmpw	IMM (0x00ff),d2
+#else
+	cmpl	IMM (0x00ff),d2
+#endif
+	bge	Lf$overflow
+| Now check for a denormalized number (exponent==0).
+	movew	d2,d2
+	beq	Lf$den
+1:
+| Put back the exponents and sign and return.
+#ifndef __mcoldfire__
+	lslw	IMM (7),d2	| exponent back to fourth byte
+#else
+	lsll	IMM (7),d2	| exponent back to fourth byte
+#endif
+	bclr	IMM (FLT_MANT_DIG-1),d0
+	swap	d0		| and put back exponent
+#ifndef __mcoldfire__
+	orw	d2,d0		| 
+#else
+	orl	d2,d0
+#endif
+	swap	d0		|
+	orl	d7,d0		| and sign also
+
+	PICLEA	SYM (_fpCCR),a0
+	movew	IMM (0),a0@
+#ifndef __mcoldfire__
+	moveml	sp@+,d2-d7
+#else
+	moveml	sp@,d2-d7
+	| XXX if frame pointer is ever removed, stack pointer must
+	| be adjusted here.
+#endif
+	unlk	a6
+	rts
+
+|=============================================================================
+|                             __negsf2
+|=============================================================================
+
+| This is trivial and could be shorter if we didn't bother checking for NaN '
+| and +/-INFINITY.
+
+| float __negsf2(float);
+	FUNC(__negsf2)
+SYM (__negsf2):
+#ifndef __mcoldfire__
+	link	a6,IMM (0)
+	moveml	d2-d7,sp@-
+#else
+	link	a6,IMM (-24)
+	moveml	d2-d7,sp@
+#endif
+	moveq	IMM (NEGATE),d5
+	movel	a6@(8),d0	| get number to negate in d0
+	bchg	IMM (31),d0	| negate
+	movel	d0,d1		| make a positive copy
+	bclr	IMM (31),d1	|
+	tstl	d1		| check for zero
+	beq	2f		| if zero (either sign) return +zero
+	cmpl	IMM (INFINITY),d1 | compare to +INFINITY
+	blt	1f		|
+	bhi	Lf$inop		| if larger (fraction not zero) is NaN
+	movel	d0,d7		| else get sign and return INFINITY
+	andl	IMM (0x80000000),d7
+	bra	Lf$infty		
+1:	PICLEA	SYM (_fpCCR),a0
+	movew	IMM (0),a0@
+#ifndef __mcoldfire__
+	moveml	sp@+,d2-d7
+#else
+	moveml	sp@,d2-d7
+	| XXX if frame pointer is ever removed, stack pointer must
+	| be adjusted here.
+#endif
+	unlk	a6
+	rts
+2:	bclr	IMM (31),d0
+	bra	1b
+
+|=============================================================================
+|                             __cmpsf2
+|=============================================================================
+
+GREATER =  1
+LESS    = -1
+EQUAL   =  0
+
+| int __cmpsf2_internal(float, float, int);
+SYM (__cmpsf2_internal):
+#ifndef __mcoldfire__
+	link	a6,IMM (0)
+	moveml	d2-d7,sp@- 	| save registers
+#else
+	link	a6,IMM (-24)
+	moveml	d2-d7,sp@
+#endif
+	moveq	IMM (COMPARE),d5
+	movel	a6@(8),d0	| get first operand
+	movel	a6@(12),d1	| get second operand
+| Check if either is NaN, and in that case return garbage and signal
+| INVALID_OPERATION. Check also if either is zero, and clear the signs
+| if necessary.
+	movel	d0,d6
+	andl	IMM (0x7fffffff),d0
+	beq	Lcmpsf$a$0
+	cmpl	IMM (0x7f800000),d0
+	bhi	Lcmpf$inop
+Lcmpsf$1:
+	movel	d1,d7
+	andl	IMM (0x7fffffff),d1
+	beq	Lcmpsf$b$0
+	cmpl	IMM (0x7f800000),d1
+	bhi	Lcmpf$inop
+Lcmpsf$2:
+| Check the signs
+	eorl	d6,d7
+	bpl	1f
+| If the signs are not equal check if a >= 0
+	tstl	d6
+	bpl	Lcmpsf$a$gt$b	| if (a >= 0 && b < 0) => a > b
+	bmi	Lcmpsf$b$gt$a	| if (a < 0 && b >= 0) => a < b
+1:
+| If the signs are equal check for < 0
+	tstl	d6
+	bpl	1f
+| If both are negative exchange them
+#ifndef __mcoldfire__
+	exg	d0,d1
+#else
+	movel	d0,d7
+	movel	d1,d0
+	movel	d7,d1
+#endif
+1:
+| Now that they are positive we just compare them as longs (does this also
+| work for denormalized numbers?).
+	cmpl	d0,d1
+	bhi	Lcmpsf$b$gt$a	| |b| > |a|
+	bne	Lcmpsf$a$gt$b	| |b| < |a|
+| If we got here a == b.
+	movel	IMM (EQUAL),d0
+#ifndef __mcoldfire__
+	moveml	sp@+,d2-d7 	| put back the registers
+#else
+	moveml	sp@,d2-d7
+#endif
+	unlk	a6
+	rts
+Lcmpsf$a$gt$b:
+	movel	IMM (GREATER),d0
+#ifndef __mcoldfire__
+	moveml	sp@+,d2-d7 	| put back the registers
+#else
+	moveml	sp@,d2-d7
+	| XXX if frame pointer is ever removed, stack pointer must
+	| be adjusted here.
+#endif
+	unlk	a6
+	rts
+Lcmpsf$b$gt$a:
+	movel	IMM (LESS),d0
+#ifndef __mcoldfire__
+	moveml	sp@+,d2-d7 	| put back the registers
+#else
+	moveml	sp@,d2-d7
+	| XXX if frame pointer is ever removed, stack pointer must
+	| be adjusted here.
+#endif
+	unlk	a6
+	rts
+
+Lcmpsf$a$0:	
+	bclr	IMM (31),d6
+	bra	Lcmpsf$1
+Lcmpsf$b$0:
+	bclr	IMM (31),d7
+	bra	Lcmpsf$2
+
+Lcmpf$inop:
+	movl	a6@(16),d0
+	moveq	IMM (INEXACT_RESULT+INVALID_OPERATION),d7
+	moveq	IMM (SINGLE_FLOAT),d6
+	PICJUMP	$_exception_handler
+
+| int __cmpsf2(float, float);
+	FUNC(__cmpsf2)
+SYM (__cmpsf2):
+	link	a6,IMM (0)
+	pea	1
+	movl	a6@(12),sp@-
+	movl	a6@(8),sp@-
+	PICCALL SYM (__cmpsf2_internal)
+	unlk	a6
+	rts
+
+|=============================================================================
+|                           rounding routines
+|=============================================================================
+
+| The rounding routines expect the number to be normalized in registers
+| d0-d1, with the exponent in register d2. They assume that the 
+| exponent is larger or equal to 1. They return a properly normalized number
+| if possible, and a denormalized number otherwise. The exponent is returned
+| in d2.
+
+Lround$to$nearest:
+| We now normalize as suggested by D. Knuth ("Seminumerical Algorithms"):
+| Here we assume that the exponent is not too small (this should be checked
+| before entering the rounding routine), but the number could be denormalized.
+
+| Check for denormalized numbers:
+1:	btst	IMM (FLT_MANT_DIG),d0
+	bne	2f		| if set the number is normalized
+| Normalize shifting left until bit #FLT_MANT_DIG is set or the exponent 
+| is one (remember that a denormalized number corresponds to an 
+| exponent of -F_BIAS+1).
+#ifndef __mcoldfire__
+	cmpw	IMM (1),d2	| remember that the exponent is at least one
+#else
+	cmpl	IMM (1),d2	| remember that the exponent is at least one
+#endif
+ 	beq	2f		| an exponent of one means denormalized
+	addl	d1,d1		| else shift and adjust the exponent
+	addxl	d0,d0		|
+#ifndef __mcoldfire__
+	dbra	d2,1b		|
+#else
+	subql	IMM (1),d2
+	bpl	1b
+#endif
+2:
+| Now round: we do it as follows: after the shifting we can write the
+| fraction part as f + delta, where 1 < f < 2^25, and 0 <= delta <= 2.
+| If delta < 1, do nothing. If delta > 1, add 1 to f. 
+| If delta == 1, we make sure the rounded number will be even (odd?) 
+| (after shifting).
+	btst	IMM (0),d0	| is delta < 1?
+	beq	2f		| if so, do not do anything
+	tstl	d1		| is delta == 1?
+	bne	1f		| if so round to even
+	movel	d0,d1		| 
+	andl	IMM (2),d1	| bit 1 is the last significant bit
+	addl	d1,d0		| 
+	bra	2f		| 
+1:	movel	IMM (1),d1	| else add 1 
+	addl	d1,d0		|
+| Shift right once (because we used bit #FLT_MANT_DIG!).
+2:	lsrl	IMM (1),d0		
+| Now check again bit #FLT_MANT_DIG (rounding could have produced a
+| 'fraction overflow' ...).
+	btst	IMM (FLT_MANT_DIG),d0	
+	beq	1f
+	lsrl	IMM (1),d0
+#ifndef __mcoldfire__
+	addw	IMM (1),d2
+#else
+	addql	IMM (1),d2
+#endif
+1:
+| If bit #FLT_MANT_DIG-1 is clear we have a denormalized number, so we 
+| have to put the exponent to zero and return a denormalized number.
+	btst	IMM (FLT_MANT_DIG-1),d0
+	beq	1f
+	jmp	a0@
+1:	movel	IMM (0),d2
+	jmp	a0@
+
+Lround$to$zero:
+Lround$to$plus:
+Lround$to$minus:
+	jmp	a0@
+#endif /* L_float */
+
+| gcc expects the routines __eqdf2, __nedf2, __gtdf2, __gedf2,
+| __ledf2, __ltdf2 to all return the same value as a direct call to
+| __cmpdf2 would.  In this implementation, each of these routines
+| simply calls __cmpdf2.  It would be more efficient to give the
+| __cmpdf2 routine several names, but separating them out will make it
+| easier to write efficient versions of these routines someday.
+| If the operands recompare unordered unordered __gtdf2 and __gedf2 return -1.
+| The other routines return 1.
+
+#ifdef  L_eqdf2
+	.text
+	FUNC(__eqdf2)
+	.globl	SYM (__eqdf2)
+SYM (__eqdf2):
+	link	a6,IMM (0)
+	pea	1
+	movl	a6@(20),sp@-
+	movl	a6@(16),sp@-
+	movl	a6@(12),sp@-
+	movl	a6@(8),sp@-
+	PICCALL	SYM (__cmpdf2_internal)
+	unlk	a6
+	rts
+#endif /* L_eqdf2 */
+
+#ifdef  L_nedf2
+	.text
+	FUNC(__nedf2)
+	.globl	SYM (__nedf2)
+SYM (__nedf2):
+	link	a6,IMM (0)
+	pea	1
+	movl	a6@(20),sp@-
+	movl	a6@(16),sp@-
+	movl	a6@(12),sp@-
+	movl	a6@(8),sp@-
+	PICCALL	SYM (__cmpdf2_internal)
+	unlk	a6
+	rts
+#endif /* L_nedf2 */
+
+#ifdef  L_gtdf2
+	.text
+	FUNC(__gtdf2)
+	.globl	SYM (__gtdf2)
+SYM (__gtdf2):
+	link	a6,IMM (0)
+	pea	-1
+	movl	a6@(20),sp@-
+	movl	a6@(16),sp@-
+	movl	a6@(12),sp@-
+	movl	a6@(8),sp@-
+	PICCALL	SYM (__cmpdf2_internal)
+	unlk	a6
+	rts
+#endif /* L_gtdf2 */
+
+#ifdef  L_gedf2
+	.text
+	FUNC(__gedf2)
+	.globl	SYM (__gedf2)
+SYM (__gedf2):
+	link	a6,IMM (0)
+	pea	-1
+	movl	a6@(20),sp@-
+	movl	a6@(16),sp@-
+	movl	a6@(12),sp@-
+	movl	a6@(8),sp@-
+	PICCALL	SYM (__cmpdf2_internal)
+	unlk	a6
+	rts
+#endif /* L_gedf2 */
+
+#ifdef  L_ltdf2
+	.text
+	FUNC(__ltdf2)
+	.globl	SYM (__ltdf2)
+SYM (__ltdf2):
+	link	a6,IMM (0)
+	pea	1
+	movl	a6@(20),sp@-
+	movl	a6@(16),sp@-
+	movl	a6@(12),sp@-
+	movl	a6@(8),sp@-
+	PICCALL	SYM (__cmpdf2_internal)
+	unlk	a6
+	rts
+#endif /* L_ltdf2 */
+
+#ifdef  L_ledf2
+	.text
+	FUNC(__ledf2)
+	.globl	SYM (__ledf2)
+SYM (__ledf2):
+	link	a6,IMM (0)
+	pea	1
+	movl	a6@(20),sp@-
+	movl	a6@(16),sp@-
+	movl	a6@(12),sp@-
+	movl	a6@(8),sp@-
+	PICCALL	SYM (__cmpdf2_internal)
+	unlk	a6
+	rts
+#endif /* L_ledf2 */
+
+| The comments above about __eqdf2, et. al., also apply to __eqsf2,
+| et. al., except that the latter call __cmpsf2 rather than __cmpdf2.
+
+#ifdef  L_eqsf2
+	.text
+	FUNC(__eqsf2)
+	.globl	SYM (__eqsf2)
+SYM (__eqsf2):
+	link	a6,IMM (0)
+	pea	1
+	movl	a6@(12),sp@-
+	movl	a6@(8),sp@-
+	PICCALL	SYM (__cmpsf2_internal)
+	unlk	a6
+	rts
+#endif /* L_eqsf2 */
+
+#ifdef  L_nesf2
+	.text
+	FUNC(__nesf2)
+	.globl	SYM (__nesf2)
+SYM (__nesf2):
+	link	a6,IMM (0)
+	pea	1
+	movl	a6@(12),sp@-
+	movl	a6@(8),sp@-
+	PICCALL	SYM (__cmpsf2_internal)
+	unlk	a6
+	rts
+#endif /* L_nesf2 */
+
+#ifdef  L_gtsf2
+	.text
+	FUNC(__gtsf2)
+	.globl	SYM (__gtsf2)
+SYM (__gtsf2):
+	link	a6,IMM (0)
+	pea	-1
+	movl	a6@(12),sp@-
+	movl	a6@(8),sp@-
+	PICCALL	SYM (__cmpsf2_internal)
+	unlk	a6
+	rts
+#endif /* L_gtsf2 */
+
+#ifdef  L_gesf2
+	.text
+	FUNC(__gesf2)
+	.globl	SYM (__gesf2)
+SYM (__gesf2):
+	link	a6,IMM (0)
+	pea	-1
+	movl	a6@(12),sp@-
+	movl	a6@(8),sp@-
+	PICCALL	SYM (__cmpsf2_internal)
+	unlk	a6
+	rts
+#endif /* L_gesf2 */
+
+#ifdef  L_ltsf2
+	.text
+	FUNC(__ltsf2)
+	.globl	SYM (__ltsf2)
+SYM (__ltsf2):
+	link	a6,IMM (0)
+	pea	1
+	movl	a6@(12),sp@-
+	movl	a6@(8),sp@-
+	PICCALL	SYM (__cmpsf2_internal)
+	unlk	a6
+	rts
+#endif /* L_ltsf2 */
+
+#ifdef  L_lesf2
+	.text
+	FUNC(__lesf2)
+	.globl	SYM (__lesf2)
+SYM (__lesf2):
+	link	a6,IMM (0)
+	pea	1
+	movl	a6@(12),sp@-
+	movl	a6@(8),sp@-
+	PICCALL	SYM (__cmpsf2_internal)
+	unlk	a6
+	rts
+#endif /* L_lesf2 */
+
+#if defined (__ELF__) && defined (__linux__)
+	/* Make stack non-executable for ELF linux targets.  */
+	.section	.note.GNU-stack,"",@progbits
+#endif
diff --git a/libgcc/config/m68k/t-floatlib b/libgcc/config/m68k/t-floatlib
new file mode 100644
index 00000000000..4160eb9f537
--- /dev/null
+++ b/libgcc/config/m68k/t-floatlib
@@ -0,0 +1,5 @@
+LIB1ASMSRC = m68k/lb1sf68.S
+LIB1ASMFUNCS = _mulsi3 _udivsi3 _divsi3 _umodsi3 _modsi3 \
+   _double _float _floatex \
+   _eqdf2 _nedf2 _gtdf2 _gedf2 _ltdf2 _ledf2 \
+   _eqsf2 _nesf2 _gtsf2 _gesf2 _ltsf2 _lesf2
diff --git a/libgcc/config/mcore/lib1funcs.S b/libgcc/config/mcore/lib1funcs.S
new file mode 100644
index 00000000000..701762f2a3c
--- /dev/null
+++ b/libgcc/config/mcore/lib1funcs.S
@@ -0,0 +1,303 @@
+/* libgcc routines for the MCore.
+   Copyright (C) 1993, 1999, 2000, 2009 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#define CONCAT1(a, b) CONCAT2(a, b)
+#define CONCAT2(a, b) a ## b
+
+/* Use the right prefix for global labels.  */
+
+#define SYM(x) CONCAT1 (__, x)
+
+#ifdef __ELF__
+#define TYPE(x) .type SYM (x),@function
+#define SIZE(x) .size SYM (x), . - SYM (x)
+#else
+#define TYPE(x)
+#define SIZE(x)
+#endif
+
+.macro FUNC_START name
+	.text
+	.globl SYM (\name)
+	TYPE (\name)
+SYM (\name):
+.endm
+
+.macro FUNC_END name
+	SIZE (\name)
+.endm
+
+#ifdef	L_udivsi3
+FUNC_START udiv32
+FUNC_START udivsi32
+
+	movi	r1,0		// r1-r2 form 64 bit dividend
+	movi	r4,1		// r4 is quotient (1 for a sentinel)
+
+	cmpnei	r3,0		// look for 0 divisor
+	bt	9f
+	trap	3		// divide by 0
+9:
+	// control iterations; skip across high order 0 bits in dividend
+	mov	r7,r2
+	cmpnei	r7,0
+	bt	8f
+	movi	r2,0		// 0 dividend
+	jmp	r15		// quick return
+8:
+	ff1	r7		// figure distance to skip
+	lsl	r4,r7		// move the sentinel along (with 0's behind)
+	lsl	r2,r7		// and the low 32 bits of numerator
+
+// appears to be wrong...
+// tested out incorrectly in our OS work...
+//	mov	r7,r3		// looking at divisor
+//	ff1	r7		// I can move 32-r7 more bits to left.
+//	addi	r7,1		// ok, one short of that...
+//	mov	r1,r2
+//	lsr	r1,r7		// bits that came from low order...
+//	rsubi	r7,31		// r7 == "32-n" == LEFT distance
+//	addi	r7,1		// this is (32-n)
+//	lsl	r4,r7		// fixes the high 32 (quotient)
+//	lsl	r2,r7
+//	cmpnei	r4,0
+//	bf	4f		// the sentinel went away...
+
+	// run the remaining bits
+
+1:	lslc	r2,1		// 1 bit left shift of r1-r2
+	addc	r1,r1
+	cmphs	r1,r3		// upper 32 of dividend >= divisor?
+	bf	2f
+	sub	r1,r3		// if yes, subtract divisor
+2:	addc	r4,r4		// shift by 1 and count subtracts
+	bf	1b		// if sentinel falls out of quotient, stop
+
+4:	mov	r2,r4		// return quotient
+	mov	r3,r1		// and piggyback the remainder
+	jmp	r15
+FUNC_END udiv32
+FUNC_END udivsi32
+#endif
+
+#ifdef	L_umodsi3
+FUNC_START urem32
+FUNC_START umodsi3
+	movi	r1,0		// r1-r2 form 64 bit dividend
+	movi	r4,1		// r4 is quotient (1 for a sentinel)
+	cmpnei	r3,0		// look for 0 divisor
+	bt	9f
+	trap	3		// divide by 0
+9:
+	// control iterations; skip across high order 0 bits in dividend
+	mov	r7,r2
+	cmpnei	r7,0
+	bt	8f
+	movi	r2,0		// 0 dividend
+	jmp	r15		// quick return
+8:
+	ff1	r7		// figure distance to skip
+	lsl	r4,r7		// move the sentinel along (with 0's behind)
+	lsl	r2,r7		// and the low 32 bits of numerator
+
+1:	lslc	r2,1		// 1 bit left shift of r1-r2
+	addc	r1,r1
+	cmphs	r1,r3		// upper 32 of dividend >= divisor?
+	bf	2f
+	sub	r1,r3		// if yes, subtract divisor
+2:	addc	r4,r4		// shift by 1 and count subtracts
+	bf	1b		// if sentinel falls out of quotient, stop
+	mov	r2,r1		// return remainder
+	jmp	r15
+FUNC_END urem32
+FUNC_END umodsi3
+#endif
+
+#ifdef	L_divsi3
+FUNC_START div32
+FUNC_START divsi3
+	mov	r5,r2		// calc sign of quotient
+	xor	r5,r3
+	abs	r2		// do unsigned divide
+	abs	r3
+	movi	r1,0		// r1-r2 form 64 bit dividend
+	movi	r4,1		// r4 is quotient (1 for a sentinel)
+	cmpnei	r3,0		// look for 0 divisor
+	bt	9f
+	trap	3		// divide by 0
+9:
+	// control iterations; skip across high order 0 bits in dividend
+	mov	r7,r2
+	cmpnei	r7,0
+	bt	8f
+	movi	r2,0		// 0 dividend
+	jmp	r15		// quick return
+8:
+	ff1	r7		// figure distance to skip
+	lsl	r4,r7		// move the sentinel along (with 0's behind)
+	lsl	r2,r7		// and the low 32 bits of numerator
+
+// tested out incorrectly in our OS work...
+//	mov	r7,r3		// looking at divisor
+//	ff1	r7		// I can move 32-r7 more bits to left.
+//	addi	r7,1		// ok, one short of that...
+//	mov	r1,r2
+//	lsr	r1,r7		// bits that came from low order...
+//	rsubi	r7,31		// r7 == "32-n" == LEFT distance
+//	addi	r7,1		// this is (32-n)
+//	lsl	r4,r7		// fixes the high 32 (quotient)
+//	lsl	r2,r7
+//	cmpnei	r4,0
+//	bf	4f		// the sentinel went away...
+
+	// run the remaining bits
+1:	lslc	r2,1		// 1 bit left shift of r1-r2
+	addc	r1,r1
+	cmphs	r1,r3		// upper 32 of dividend >= divisor?
+	bf	2f
+	sub	r1,r3		// if yes, subtract divisor
+2:	addc	r4,r4		// shift by 1 and count subtracts
+	bf	1b		// if sentinel falls out of quotient, stop
+
+4:	mov	r2,r4		// return quotient
+	mov	r3,r1		// piggyback the remainder
+	btsti	r5,31		// after adjusting for sign
+	bf	3f
+	rsubi	r2,0
+	rsubi	r3,0
+3:	jmp	r15
+FUNC_END div32
+FUNC_END divsi3
+#endif
+
+#ifdef	L_modsi3
+FUNC_START rem32
+FUNC_START modsi3
+	mov	r5,r2		// calc sign of remainder
+	abs	r2		// do unsigned divide
+	abs	r3
+	movi	r1,0		// r1-r2 form 64 bit dividend
+	movi	r4,1		// r4 is quotient (1 for a sentinel)
+	cmpnei	r3,0		// look for 0 divisor
+	bt	9f
+	trap	3		// divide by 0
+9: 
+	// control iterations; skip across high order 0 bits in dividend
+	mov	r7,r2
+	cmpnei	r7,0
+	bt	8f
+	movi	r2,0		// 0 dividend
+	jmp	r15		// quick return
+8:
+	ff1	r7		// figure distance to skip
+	lsl	r4,r7		// move the sentinel along (with 0's behind)
+	lsl	r2,r7		// and the low 32 bits of numerator
+
+1:	lslc	r2,1		// 1 bit left shift of r1-r2
+	addc	r1,r1
+	cmphs	r1,r3		// upper 32 of dividend >= divisor?
+	bf	2f
+	sub	r1,r3		// if yes, subtract divisor
+2:	addc	r4,r4		// shift by 1 and count subtracts
+	bf	1b		// if sentinel falls out of quotient, stop
+	mov	r2,r1		// return remainder
+	btsti	r5,31		// after adjusting for sign
+	bf	3f
+	rsubi	r2,0
+3:	jmp	r15
+FUNC_END rem32
+FUNC_END modsi3
+#endif
+
+
+/* GCC expects that {__eq,__ne,__gt,__ge,__le,__lt}{df2,sf2}
+   will behave as __cmpdf2. So, we stub the implementations to
+   jump on to __cmpdf2 and __cmpsf2.
+ 
+   All of these shortcircuit the return path so that __cmp{sd}f2
+   will go directly back to the caller.  */
+
+.macro  COMPARE_DF_JUMP name
+	.import SYM (cmpdf2)
+FUNC_START \name
+	jmpi SYM (cmpdf2)
+FUNC_END \name
+.endm
+		
+#ifdef  L_eqdf2
+COMPARE_DF_JUMP eqdf2
+#endif /* L_eqdf2 */
+
+#ifdef  L_nedf2
+COMPARE_DF_JUMP nedf2
+#endif /* L_nedf2 */
+
+#ifdef  L_gtdf2
+COMPARE_DF_JUMP gtdf2
+#endif /* L_gtdf2 */
+
+#ifdef  L_gedf2
+COMPARE_DF_JUMP gedf2
+#endif /* L_gedf2 */
+
+#ifdef  L_ltdf2
+COMPARE_DF_JUMP ltdf2
+#endif /* L_ltdf2 */
+	
+#ifdef  L_ledf2
+COMPARE_DF_JUMP ledf2
+#endif /* L_ledf2 */
+
+/* SINGLE PRECISION FLOATING POINT STUBS */
+
+.macro  COMPARE_SF_JUMP name
+	.import SYM (cmpsf2)
+FUNC_START \name
+	jmpi SYM (cmpsf2)
+FUNC_END \name
+.endm
+		
+#ifdef  L_eqsf2
+COMPARE_SF_JUMP eqsf2
+#endif /* L_eqsf2 */
+	
+#ifdef  L_nesf2
+COMPARE_SF_JUMP nesf2
+#endif /* L_nesf2 */
+	
+#ifdef  L_gtsf2
+COMPARE_SF_JUMP gtsf2
+#endif /* L_gtsf2 */
+	
+#ifdef  L_gesf2
+COMPARE_SF_JUMP __gesf2
+#endif /* L_gesf2 */
+	
+#ifdef  L_ltsf2
+COMPARE_SF_JUMP __ltsf2
+#endif /* L_ltsf2 */
+	
+#ifdef  L_lesf2
+COMPARE_SF_JUMP lesf2
+#endif /* L_lesf2 */
diff --git a/libgcc/config/mcore/t-mcore b/libgcc/config/mcore/t-mcore
new file mode 100644
index 00000000000..19c4c15cd0b
--- /dev/null
+++ b/libgcc/config/mcore/t-mcore
@@ -0,0 +1,2 @@
+LIB1ASMSRC    = mcore/lib1funcs.S
+LIB1ASMFUNCS  = _divsi3 _udivsi3 _modsi3 _umodsi3
diff --git a/libgcc/config/mep/lib1funcs.S b/libgcc/config/mep/lib1funcs.S
new file mode 100644
index 00000000000..0a18913f927
--- /dev/null
+++ b/libgcc/config/mep/lib1funcs.S
@@ -0,0 +1,125 @@
+/* libgcc routines for Toshiba Media Processor.
+   Copyright (C) 2001, 2002, 2005, 2009 Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+  
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+  
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#define SAVEALL \
+	add3	$sp, $sp, -16*4 ; \
+	sw	$0, ($sp) ; \
+	sw	$1, 4($sp) ; \
+	sw	$2, 8($sp) ; \
+	sw	$3, 12($sp) ; \
+	sw	$4, 16($sp) ; \
+	sw	$5, 20($sp) ; \
+	sw	$6, 24($sp) ; \
+	sw	$7, 28($sp) ; \
+	sw	$8, 32($sp) ; \
+	sw	$9, 36($sp) ; \
+	sw	$10, 40($sp) ; \
+	sw	$11, 44($sp) ; \
+	sw	$12, 48($sp) ; \
+	sw	$13, 52($sp) ; \
+	sw	$14, 56($sp) ; \
+	ldc	$5, $lp	; \
+	add	$5, 3 ; \
+	mov	$6, -4 ; \
+	and	$5, $6
+
+#define RESTOREALL \
+	stc	$5, $lp ; \
+	lw	$14, 56($sp) ; \
+	lw	$13, 52($sp) ; \
+	lw	$12, 48($sp) ; \
+	lw	$11, 44($sp) ; \
+	lw	$10, 40($sp) ; \
+	lw	$9, 36($sp) ; \
+	lw	$8, 32($sp) ; \
+	lw	$7, 28($sp) ; \
+	lw	$6, 24($sp) ; \
+	lw	$5, 20($sp) ; \
+	lw	$4, 16($sp) ; \
+	lw	$3, 12($sp) ; \
+	lw	$2, 8($sp) ; \
+	lw	$1, 4($sp) ; \
+	lw	$0, ($sp) ; \
+	add3	$sp, $sp, 16*4 ; \
+	ret
+
+#ifdef L_mep_profile
+	.text
+	.global __mep_mcount
+__mep_mcount:
+	SAVEALL
+	ldc	$1, $lp
+	mov	$2, $0
+	bsr	__mep_mcount_2
+	RESTOREALL
+#endif
+
+#ifdef L_mep_bb_init_trace
+	.text
+	.global __mep_bb_init_trace_func
+__mep_bb_init_trace_func:
+	SAVEALL
+	lw	$1, ($5)
+	lw	$2, 4($5)
+	add	$5, 8
+	bsr	__bb_init_trace_func
+	RESTOREALL
+#endif
+
+#ifdef L_mep_bb_init
+	.text
+	.global __mep_bb_init_func
+__mep_bb_init_func:
+	SAVEALL
+	lw	$1, ($5)
+	add	$5, 4
+	bsr	__bb_init_func
+	RESTOREALL
+#endif
+
+#ifdef L_mep_bb_trace
+	.text
+	.global __mep_bb_trace_func
+__mep_bb_trace_func:
+	SAVEALL
+	movu	$3, __bb
+	lw	$1, ($5)
+	sw	$1, ($3)
+	lw	$2, 4($5)
+	sw	$2, 4($3)
+	add	$5, 8
+	bsr	__bb_trace_func
+	RESTOREALL
+#endif
+
+#ifdef L_mep_bb_increment
+	.text
+	.global __mep_bb_increment_func
+__mep_bb_increment_func:
+	SAVEALL
+	lw	$1, ($5)
+	lw	$0, ($1)
+	add	$0, 1
+	sw	$0, ($1)
+	add	$5, 4
+	RESTOREALL
+#endif
diff --git a/libgcc/config/mep/t-mep b/libgcc/config/mep/t-mep
index 36e6f5dc771..d1fb094a41e 100644
--- a/libgcc/config/mep/t-mep
+++ b/libgcc/config/mep/t-mep
@@ -1,2 +1,11 @@
+# profiling support
+LIB1ASMSRC = mep/lib1funcs.S
+
+LIB1ASMFUNCS = _mep_profile \
+	       _mep_bb_init_trace \
+	       _mep_bb_init \
+	       _mep_bb_trace \
+	       _mep_bb_increment
+
 # Use -O0 instead of -O2 so we don't get complex relocations
 CRTSTUFF_CFLAGS += -O0
diff --git a/libgcc/config/mips/mips16.S b/libgcc/config/mips/mips16.S
new file mode 100644
index 00000000000..ec331b5f65e
--- /dev/null
+++ b/libgcc/config/mips/mips16.S
@@ -0,0 +1,712 @@
+/* mips16 floating point support code
+   Copyright (C) 1996, 1997, 1998, 2008, 2009, 2010
+   Free Software Foundation, Inc.
+   Contributed by Cygnus Support
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/* This file contains mips16 floating point support functions.  These
+   functions are called by mips16 code to handle floating point when
+   -msoft-float is not used.  They accept the arguments and return
+   values using the soft-float calling convention, but do the actual
+   operation using the hard floating point instructions.  */
+
+#if defined _MIPS_SIM && (_MIPS_SIM == _ABIO32 || _MIPS_SIM == _ABIO64)
+
+/* This file contains 32-bit assembly code.  */
+	.set nomips16
+
+/* Start a function.  */
+
+#define STARTFN(NAME) .globl NAME; .ent NAME; NAME:
+
+/* Finish a function.  */
+
+#define ENDFN(NAME) .end NAME
+
+/* ARG1
+	The FPR that holds the first floating-point argument.
+
+   ARG2
+	The FPR that holds the second floating-point argument.
+
+   RET
+	The FPR that holds a floating-point return value.  */
+
+#define RET $f0
+#define ARG1 $f12
+#ifdef __mips64
+#define ARG2 $f13
+#else
+#define ARG2 $f14
+#endif
+
+/* Set 64-bit register GPR so that its high 32 bits contain HIGH_FPR
+   and so that its low 32 bits contain LOW_FPR.  */
+#define MERGE_GPRf(GPR, HIGH_FPR, LOW_FPR)	\
+	.set	noat;				\
+	mfc1	$1, LOW_FPR;			\
+	mfc1	GPR, HIGH_FPR;			\
+	dsll	$1, $1, 32;			\
+	dsll	GPR, GPR, 32;			\
+	dsrl	$1, $1, 32;			\
+	or	GPR, GPR, $1;			\
+	.set	at
+
+/* Move the high 32 bits of GPR to HIGH_FPR and the low 32 bits of
+   GPR to LOW_FPR.  */
+#define MERGE_GPRt(GPR, HIGH_FPR, LOW_FPR)	\
+	.set	noat;				\
+	dsrl	$1, GPR, 32;			\
+	mtc1	GPR, LOW_FPR;			\
+	mtc1	$1, HIGH_FPR;			\
+	.set	at
+
+/* Jump to T, and use "OPCODE, OP2" to implement a delayed move.  */
+#define DELAYt(T, OPCODE, OP2)			\
+	.set	noreorder;			\
+	jr	T;				\
+	OPCODE, OP2;				\
+	.set	reorder
+
+/* Use "OPCODE. OP2" and jump to T.  */
+#define DELAYf(T, OPCODE, OP2) OPCODE, OP2; jr T
+
+/* MOVE_SF_BYTE0(D)
+	Move the first single-precision floating-point argument between
+	GPRs and FPRs.
+
+   MOVE_SI_BYTE0(D)
+	Likewise the first single-precision integer argument.
+
+   MOVE_SF_BYTE4(D)
+	Move the second single-precision floating-point argument between
+	GPRs and FPRs, given that the first argument occupies 4 bytes.
+
+   MOVE_SF_BYTE8(D)
+	Move the second single-precision floating-point argument between
+	GPRs and FPRs, given that the first argument occupies 8 bytes.
+
+   MOVE_DF_BYTE0(D)
+	Move the first double-precision floating-point argument between
+	GPRs and FPRs.
+
+   MOVE_DF_BYTE8(D)
+	Likewise the second double-precision floating-point argument.
+
+   MOVE_SF_RET(D, T)
+	Likewise a single-precision floating-point return value,
+	then jump to T.
+
+   MOVE_SC_RET(D, T)
+	Likewise a complex single-precision floating-point return value.
+
+   MOVE_DF_RET(D, T)
+	Likewise a double-precision floating-point return value.
+
+   MOVE_DC_RET(D, T)
+	Likewise a complex double-precision floating-point return value.
+
+   MOVE_SI_RET(D, T)
+	Likewise a single-precision integer return value.
+
+   The D argument is "t" to move to FPRs and "f" to move from FPRs.
+   The return macros may assume that the target of the jump does not
+   use a floating-point register.  */
+
+#define MOVE_SF_RET(D, T) DELAY##D (T, m##D##c1 $2,$f0)
+#define MOVE_SI_RET(D, T) DELAY##D (T, m##D##c1 $2,$f0)
+
+#if defined(__mips64) && defined(__MIPSEB__)
+#define MOVE_SC_RET(D, T) MERGE_GPR##D ($2, $f0, $f1); jr T
+#elif defined(__mips64)
+/* The high 32 bits of $2 correspond to the second word in memory;
+   i.e. the imaginary part.  */
+#define MOVE_SC_RET(D, T) MERGE_GPR##D ($2, $f1, $f0); jr T
+#elif __mips_fpr == 64
+#define MOVE_SC_RET(D, T) m##D##c1 $2,$f0; DELAY##D (T, m##D##c1 $3,$f1)
+#else
+#define MOVE_SC_RET(D, T) m##D##c1 $2,$f0; DELAY##D (T, m##D##c1 $3,$f2)
+#endif
+
+#if defined(__mips64)
+#define MOVE_SF_BYTE0(D) m##D##c1 $4,$f12
+#define MOVE_SF_BYTE4(D) m##D##c1 $5,$f13
+#define MOVE_SF_BYTE8(D) m##D##c1 $5,$f13
+#else
+#define MOVE_SF_BYTE0(D) m##D##c1 $4,$f12
+#define MOVE_SF_BYTE4(D) m##D##c1 $5,$f14
+#define MOVE_SF_BYTE8(D) m##D##c1 $6,$f14
+#endif
+#define MOVE_SI_BYTE0(D) MOVE_SF_BYTE0(D)
+
+#if defined(__mips64)
+#define MOVE_DF_BYTE0(D) dm##D##c1 $4,$f12
+#define MOVE_DF_BYTE8(D) dm##D##c1 $5,$f13
+#define MOVE_DF_RET(D, T) DELAY##D (T, dm##D##c1 $2,$f0)
+#define MOVE_DC_RET(D, T) dm##D##c1 $3,$f1; MOVE_DF_RET (D, T)
+#elif __mips_fpr == 64 && defined(__MIPSEB__)
+#define MOVE_DF_BYTE0(D) m##D##c1 $5,$f12; m##D##hc1 $4,$f12
+#define MOVE_DF_BYTE8(D) m##D##c1 $7,$f14; m##D##hc1 $6,$f14
+#define MOVE_DF_RET(D, T) m##D##c1 $3,$f0; DELAY##D (T, m##D##hc1 $2,$f0)
+#define MOVE_DC_RET(D, T) m##D##c1 $5,$f1; m##D##hc1 $4,$f1; MOVE_DF_RET (D, T)
+#elif __mips_fpr == 64
+#define MOVE_DF_BYTE0(D) m##D##c1 $4,$f12; m##D##hc1 $5,$f12
+#define MOVE_DF_BYTE8(D) m##D##c1 $6,$f14; m##D##hc1 $7,$f14
+#define MOVE_DF_RET(D, T) m##D##c1 $2,$f0; DELAY##D (T, m##D##hc1 $3,$f0)
+#define MOVE_DC_RET(D, T) m##D##c1 $4,$f1; m##D##hc1 $5,$f1; MOVE_DF_RET (D, T)
+#elif defined(__MIPSEB__)
+/* FPRs are little-endian.  */
+#define MOVE_DF_BYTE0(D) m##D##c1 $4,$f13; m##D##c1 $5,$f12
+#define MOVE_DF_BYTE8(D) m##D##c1 $6,$f15; m##D##c1 $7,$f14
+#define MOVE_DF_RET(D, T) m##D##c1 $2,$f1; DELAY##D (T, m##D##c1 $3,$f0)
+#define MOVE_DC_RET(D, T) m##D##c1 $4,$f3; m##D##c1 $5,$f2; MOVE_DF_RET (D, T)
+#else
+#define MOVE_DF_BYTE0(D) m##D##c1 $4,$f12; m##D##c1 $5,$f13
+#define MOVE_DF_BYTE8(D) m##D##c1 $6,$f14; m##D##c1 $7,$f15
+#define MOVE_DF_RET(D, T) m##D##c1 $2,$f0; DELAY##D (T, m##D##c1 $3,$f1)
+#define MOVE_DC_RET(D, T) m##D##c1 $4,$f2; m##D##c1 $5,$f3; MOVE_DF_RET (D, T)
+#endif
+
+/* Single-precision math.  */
+
+/* Define a function NAME that loads two single-precision values,
+   performs FPU operation OPCODE on them, and returns the single-
+   precision result.  */
+
+#define OPSF3(NAME, OPCODE)	\
+STARTFN (NAME);			\
+	MOVE_SF_BYTE0 (t);	\
+	MOVE_SF_BYTE4 (t);	\
+	OPCODE	RET,ARG1,ARG2;	\
+	MOVE_SF_RET (f, $31);	\
+	ENDFN (NAME)
+
+#ifdef L_m16addsf3
+OPSF3 (__mips16_addsf3, add.s)
+#endif
+#ifdef L_m16subsf3
+OPSF3 (__mips16_subsf3, sub.s)
+#endif
+#ifdef L_m16mulsf3
+OPSF3 (__mips16_mulsf3, mul.s)
+#endif
+#ifdef L_m16divsf3
+OPSF3 (__mips16_divsf3, div.s)
+#endif
+
+/* Define a function NAME that loads a single-precision value,
+   performs FPU operation OPCODE on it, and returns the single-
+   precision result.  */
+
+#define OPSF2(NAME, OPCODE)	\
+STARTFN (NAME);			\
+	MOVE_SF_BYTE0 (t);	\
+	OPCODE	RET,ARG1;	\
+	MOVE_SF_RET (f, $31);	\
+	ENDFN (NAME)
+
+#ifdef L_m16negsf2
+OPSF2 (__mips16_negsf2, neg.s)
+#endif
+#ifdef L_m16abssf2
+OPSF2 (__mips16_abssf2, abs.s)
+#endif
+
+/* Single-precision comparisons.  */
+
+/* Define a function NAME that loads two single-precision values,
+   performs floating point comparison OPCODE, and returns TRUE or
+   FALSE depending on the result.  */
+
+#define CMPSF(NAME, OPCODE, TRUE, FALSE)	\
+STARTFN (NAME);					\
+	MOVE_SF_BYTE0 (t);			\
+	MOVE_SF_BYTE4 (t);			\
+	OPCODE	ARG1,ARG2;			\
+	li	$2,TRUE;			\
+	bc1t	1f;				\
+	li	$2,FALSE;			\
+1:;						\
+	j	$31;				\
+	ENDFN (NAME)
+
+/* Like CMPSF, but reverse the comparison operands.  */
+
+#define REVCMPSF(NAME, OPCODE, TRUE, FALSE)	\
+STARTFN (NAME);					\
+	MOVE_SF_BYTE0 (t);			\
+	MOVE_SF_BYTE4 (t);			\
+	OPCODE	ARG2,ARG1;			\
+	li	$2,TRUE;			\
+	bc1t	1f;				\
+	li	$2,FALSE;			\
+1:;						\
+	j	$31;				\
+	ENDFN (NAME)
+
+#ifdef L_m16eqsf2
+CMPSF (__mips16_eqsf2, c.eq.s, 0, 1)
+#endif
+#ifdef L_m16nesf2
+CMPSF (__mips16_nesf2, c.eq.s, 0, 1)
+#endif
+#ifdef L_m16gtsf2
+REVCMPSF (__mips16_gtsf2, c.lt.s, 1, 0)
+#endif
+#ifdef L_m16gesf2
+REVCMPSF (__mips16_gesf2, c.le.s, 0, -1)
+#endif
+#ifdef L_m16lesf2
+CMPSF (__mips16_lesf2, c.le.s, 0, 1)
+#endif
+#ifdef L_m16ltsf2
+CMPSF (__mips16_ltsf2, c.lt.s, -1, 0)
+#endif
+#ifdef L_m16unordsf2
+CMPSF(__mips16_unordsf2, c.un.s, 1, 0)
+#endif
+
+
+/* Single-precision conversions.  */
+
+#ifdef L_m16fltsisf
+STARTFN (__mips16_floatsisf)
+	MOVE_SF_BYTE0 (t)
+	cvt.s.w	RET,ARG1
+	MOVE_SF_RET (f, $31)
+	ENDFN (__mips16_floatsisf)
+#endif
+
+#ifdef L_m16fltunsisf
+STARTFN (__mips16_floatunsisf)
+	.set	noreorder
+	bltz	$4,1f
+	MOVE_SF_BYTE0 (t)
+	.set	reorder
+	cvt.s.w	RET,ARG1
+	MOVE_SF_RET (f, $31)
+1:		
+	and	$2,$4,1
+	srl	$3,$4,1
+	or	$2,$2,$3
+	mtc1	$2,RET
+	cvt.s.w	RET,RET
+	add.s	RET,RET,RET
+	MOVE_SF_RET (f, $31)
+	ENDFN (__mips16_floatunsisf)
+#endif
+	
+#ifdef L_m16fix_truncsfsi
+STARTFN (__mips16_fix_truncsfsi)
+	MOVE_SF_BYTE0 (t)
+	trunc.w.s RET,ARG1,$4
+	MOVE_SI_RET (f, $31)
+	ENDFN (__mips16_fix_truncsfsi)
+#endif
+
+#if !defined(__mips_single_float) && !defined(__SINGLE_FLOAT)
+
+/* Double-precision math.  */
+
+/* Define a function NAME that loads two double-precision values,
+   performs FPU operation OPCODE on them, and returns the double-
+   precision result.  */
+
+#define OPDF3(NAME, OPCODE)	\
+STARTFN (NAME);			\
+	MOVE_DF_BYTE0 (t);	\
+	MOVE_DF_BYTE8 (t);	\
+	OPCODE RET,ARG1,ARG2;	\
+	MOVE_DF_RET (f, $31);	\
+	ENDFN (NAME)
+
+#ifdef L_m16adddf3
+OPDF3 (__mips16_adddf3, add.d)
+#endif
+#ifdef L_m16subdf3
+OPDF3 (__mips16_subdf3, sub.d)
+#endif
+#ifdef L_m16muldf3
+OPDF3 (__mips16_muldf3, mul.d)
+#endif
+#ifdef L_m16divdf3
+OPDF3 (__mips16_divdf3, div.d)
+#endif
+
+/* Define a function NAME that loads a double-precision value,
+   performs FPU operation OPCODE on it, and returns the double-
+   precision result.  */
+
+#define OPDF2(NAME, OPCODE)	\
+STARTFN (NAME);			\
+	MOVE_DF_BYTE0 (t);	\
+	OPCODE RET,ARG1;	\
+	MOVE_DF_RET (f, $31);	\
+	ENDFN (NAME)
+
+#ifdef L_m16negdf2
+OPDF2 (__mips16_negdf2, neg.d)
+#endif
+#ifdef L_m16absdf2
+OPDF2 (__mips16_absdf2, abs.d)
+#endif
+
+/* Conversions between single and double precision.  */
+
+#ifdef L_m16extsfdf2
+STARTFN (__mips16_extendsfdf2)
+	MOVE_SF_BYTE0 (t)
+	cvt.d.s	RET,ARG1
+	MOVE_DF_RET (f, $31)
+	ENDFN (__mips16_extendsfdf2)
+#endif
+
+#ifdef L_m16trdfsf2
+STARTFN (__mips16_truncdfsf2)
+	MOVE_DF_BYTE0 (t)
+	cvt.s.d	RET,ARG1
+	MOVE_SF_RET (f, $31)
+	ENDFN (__mips16_truncdfsf2)
+#endif
+
+/* Double-precision comparisons.  */
+
+/* Define a function NAME that loads two double-precision values,
+   performs floating point comparison OPCODE, and returns TRUE or
+   FALSE depending on the result.  */
+
+#define CMPDF(NAME, OPCODE, TRUE, FALSE)	\
+STARTFN (NAME);					\
+	MOVE_DF_BYTE0 (t);			\
+	MOVE_DF_BYTE8 (t);			\
+	OPCODE	ARG1,ARG2;			\
+	li	$2,TRUE;			\
+	bc1t	1f;				\
+	li	$2,FALSE;			\
+1:;						\
+	j	$31;				\
+	ENDFN (NAME)
+
+/* Like CMPDF, but reverse the comparison operands.  */
+
+#define REVCMPDF(NAME, OPCODE, TRUE, FALSE)	\
+STARTFN (NAME);					\
+	MOVE_DF_BYTE0 (t);			\
+	MOVE_DF_BYTE8 (t);			\
+	OPCODE	ARG2,ARG1;			\
+	li	$2,TRUE;			\
+	bc1t	1f;				\
+	li	$2,FALSE;			\
+1:;						\
+	j	$31;				\
+	ENDFN (NAME)
+
+#ifdef L_m16eqdf2
+CMPDF (__mips16_eqdf2, c.eq.d, 0, 1)
+#endif
+#ifdef L_m16nedf2
+CMPDF (__mips16_nedf2, c.eq.d, 0, 1)
+#endif
+#ifdef L_m16gtdf2
+REVCMPDF (__mips16_gtdf2, c.lt.d, 1, 0)
+#endif
+#ifdef L_m16gedf2
+REVCMPDF (__mips16_gedf2, c.le.d, 0, -1)
+#endif
+#ifdef L_m16ledf2
+CMPDF (__mips16_ledf2, c.le.d, 0, 1)
+#endif
+#ifdef L_m16ltdf2
+CMPDF (__mips16_ltdf2, c.lt.d, -1, 0)
+#endif
+#ifdef L_m16unorddf2
+CMPDF(__mips16_unorddf2, c.un.d, 1, 0)
+#endif
+
+/* Double-precision conversions.  */
+
+#ifdef L_m16fltsidf
+STARTFN (__mips16_floatsidf)
+	MOVE_SI_BYTE0 (t)
+	cvt.d.w	RET,ARG1
+	MOVE_DF_RET (f, $31)
+	ENDFN (__mips16_floatsidf)
+#endif
+	
+#ifdef L_m16fltunsidf
+STARTFN (__mips16_floatunsidf)
+	MOVE_SI_BYTE0 (t)
+	cvt.d.w RET,ARG1
+	bgez	$4,1f
+	li.d	ARG1, 4.294967296e+9
+	add.d	RET, RET, ARG1
+1:	MOVE_DF_RET (f, $31)
+	ENDFN (__mips16_floatunsidf)
+#endif
+	
+#ifdef L_m16fix_truncdfsi
+STARTFN (__mips16_fix_truncdfsi)
+	MOVE_DF_BYTE0 (t)
+	trunc.w.d RET,ARG1,$4
+	MOVE_SI_RET (f, $31)
+	ENDFN (__mips16_fix_truncdfsi)
+#endif
+#endif /* !__mips_single_float */
+
+/* Define a function NAME that moves a return value of mode MODE from
+   FPRs to GPRs.  */
+
+#define RET_FUNCTION(NAME, MODE)	\
+STARTFN (NAME);				\
+	MOVE_##MODE##_RET (t, $31);	\
+	ENDFN (NAME)
+
+#ifdef L_m16retsf
+RET_FUNCTION (__mips16_ret_sf, SF)
+#endif
+
+#ifdef L_m16retsc
+RET_FUNCTION (__mips16_ret_sc, SC)
+#endif
+
+#if !defined(__mips_single_float) && !defined(__SINGLE_FLOAT)
+#ifdef L_m16retdf
+RET_FUNCTION (__mips16_ret_df, DF)
+#endif
+
+#ifdef L_m16retdc
+RET_FUNCTION (__mips16_ret_dc, DC)
+#endif
+#endif /* !__mips_single_float */
+
+/* STUB_ARGS_X copies the arguments from GPRs to FPRs for argument
+   code X.  X is calculated as ARG1 + ARG2 * 4, where ARG1 and ARG2
+   classify the first and second arguments as follows:
+
+	1: a single-precision argument
+	2: a double-precision argument
+	0: no argument, or not one of the above.  */
+
+#define STUB_ARGS_0						/* () */
+#define STUB_ARGS_1 MOVE_SF_BYTE0 (t)				/* (sf) */
+#define STUB_ARGS_5 MOVE_SF_BYTE0 (t); MOVE_SF_BYTE4 (t)	/* (sf, sf) */
+#define STUB_ARGS_9 MOVE_SF_BYTE0 (t); MOVE_DF_BYTE8 (t)	/* (sf, df) */
+#define STUB_ARGS_2 MOVE_DF_BYTE0 (t)				/* (df) */
+#define STUB_ARGS_6 MOVE_DF_BYTE0 (t); MOVE_SF_BYTE8 (t)	/* (df, sf) */
+#define STUB_ARGS_10 MOVE_DF_BYTE0 (t); MOVE_DF_BYTE8 (t)	/* (df, df) */
+
+/* These functions are used by 16-bit code when calling via a function
+   pointer.  They must copy the floating point arguments from the GPRs
+   to FPRs and then call function $2.  */
+
+#define CALL_STUB_NO_RET(NAME, CODE)	\
+STARTFN (NAME);				\
+	STUB_ARGS_##CODE;		\
+	.set	noreorder;		\
+	jr	$2;			\
+	move	$25,$2;			\
+	.set	reorder;		\
+	ENDFN (NAME)
+
+#ifdef L_m16stub1
+CALL_STUB_NO_RET (__mips16_call_stub_1, 1)
+#endif
+
+#ifdef L_m16stub5
+CALL_STUB_NO_RET (__mips16_call_stub_5, 5)
+#endif
+
+#if !defined(__mips_single_float) && !defined(__SINGLE_FLOAT)
+
+#ifdef L_m16stub2
+CALL_STUB_NO_RET (__mips16_call_stub_2, 2)
+#endif
+
+#ifdef L_m16stub6
+CALL_STUB_NO_RET (__mips16_call_stub_6, 6)
+#endif
+
+#ifdef L_m16stub9
+CALL_STUB_NO_RET (__mips16_call_stub_9, 9)
+#endif
+
+#ifdef L_m16stub10
+CALL_STUB_NO_RET (__mips16_call_stub_10, 10)
+#endif
+#endif /* !__mips_single_float */
+
+/* Now we have the same set of functions, except that this time the
+   function being called returns an SFmode, SCmode, DFmode or DCmode
+   value; we need to instantiate a set for each case.  The calling
+   function will arrange to preserve $18, so these functions are free
+   to use it to hold the return address.
+
+   Note that we do not know whether the function we are calling is 16
+   bit or 32 bit.  However, it does not matter, because 16-bit
+   functions always return floating point values in both the gp and
+   the fp regs.  It would be possible to check whether the function
+   being called is 16 bits, in which case the copy is unnecessary;
+   however, it's faster to always do the copy.  */
+
+#define CALL_STUB_RET(NAME, CODE, MODE)	\
+STARTFN (NAME);				\
+	move	$18,$31;		\
+	STUB_ARGS_##CODE;		\
+	.set	noreorder;		\
+	jalr	$2;			\
+	move	$25,$2;			\
+	.set	reorder;		\
+	MOVE_##MODE##_RET (f, $18);	\
+	ENDFN (NAME)
+
+/* First, instantiate the single-float set.  */
+
+#ifdef L_m16stubsf0
+CALL_STUB_RET (__mips16_call_stub_sf_0, 0, SF)
+#endif
+
+#ifdef L_m16stubsf1
+CALL_STUB_RET (__mips16_call_stub_sf_1, 1, SF)
+#endif
+
+#ifdef L_m16stubsf5
+CALL_STUB_RET (__mips16_call_stub_sf_5, 5, SF)
+#endif
+
+#if !defined(__mips_single_float) && !defined(__SINGLE_FLOAT)
+#ifdef L_m16stubsf2
+CALL_STUB_RET (__mips16_call_stub_sf_2, 2, SF)
+#endif
+
+#ifdef L_m16stubsf6
+CALL_STUB_RET (__mips16_call_stub_sf_6, 6, SF)
+#endif
+
+#ifdef L_m16stubsf9
+CALL_STUB_RET (__mips16_call_stub_sf_9, 9, SF)
+#endif
+
+#ifdef L_m16stubsf10
+CALL_STUB_RET (__mips16_call_stub_sf_10, 10, SF)
+#endif
+#endif /* !__mips_single_float */
+
+
+/* Now we have the same set of functions again, except that this time
+   the function being called returns an DFmode value.  */
+
+#if !defined(__mips_single_float) && !defined(__SINGLE_FLOAT)
+#ifdef L_m16stubdf0
+CALL_STUB_RET (__mips16_call_stub_df_0, 0, DF)
+#endif
+
+#ifdef L_m16stubdf1
+CALL_STUB_RET (__mips16_call_stub_df_1, 1, DF)
+#endif
+
+#ifdef L_m16stubdf5
+CALL_STUB_RET (__mips16_call_stub_df_5, 5, DF)
+#endif
+
+#ifdef L_m16stubdf2
+CALL_STUB_RET (__mips16_call_stub_df_2, 2, DF)
+#endif
+
+#ifdef L_m16stubdf6
+CALL_STUB_RET (__mips16_call_stub_df_6, 6, DF)
+#endif
+
+#ifdef L_m16stubdf9
+CALL_STUB_RET (__mips16_call_stub_df_9, 9, DF)
+#endif
+
+#ifdef L_m16stubdf10
+CALL_STUB_RET (__mips16_call_stub_df_10, 10, DF)
+#endif
+#endif /* !__mips_single_float */
+
+
+/* Ho hum.  Here we have the same set of functions again, this time
+   for when the function being called returns an SCmode value.  */
+
+#ifdef L_m16stubsc0
+CALL_STUB_RET (__mips16_call_stub_sc_0, 0, SC)
+#endif
+
+#ifdef L_m16stubsc1
+CALL_STUB_RET (__mips16_call_stub_sc_1, 1, SC)
+#endif
+
+#ifdef L_m16stubsc5
+CALL_STUB_RET (__mips16_call_stub_sc_5, 5, SC)
+#endif
+
+#if !defined(__mips_single_float) && !defined(__SINGLE_FLOAT)
+#ifdef L_m16stubsc2
+CALL_STUB_RET (__mips16_call_stub_sc_2, 2, SC)
+#endif
+
+#ifdef L_m16stubsc6
+CALL_STUB_RET (__mips16_call_stub_sc_6, 6, SC)
+#endif
+
+#ifdef L_m16stubsc9
+CALL_STUB_RET (__mips16_call_stub_sc_9, 9, SC)
+#endif
+
+#ifdef L_m16stubsc10
+CALL_STUB_RET (__mips16_call_stub_sc_10, 10, SC)
+#endif
+#endif /* !__mips_single_float */
+
+
+/* Finally, another set of functions for DCmode.  */
+
+#if !defined(__mips_single_float) && !defined(__SINGLE_FLOAT)
+#ifdef L_m16stubdc0
+CALL_STUB_RET (__mips16_call_stub_dc_0, 0, DC)
+#endif
+
+#ifdef L_m16stubdc1
+CALL_STUB_RET (__mips16_call_stub_dc_1, 1, DC)
+#endif
+
+#ifdef L_m16stubdc5
+CALL_STUB_RET (__mips16_call_stub_dc_5, 5, DC)
+#endif
+
+#ifdef L_m16stubdc2
+CALL_STUB_RET (__mips16_call_stub_dc_2, 2, DC)
+#endif
+
+#ifdef L_m16stubdc6
+CALL_STUB_RET (__mips16_call_stub_dc_6, 6, DC)
+#endif
+
+#ifdef L_m16stubdc9
+CALL_STUB_RET (__mips16_call_stub_dc_9, 9, DC)
+#endif
+
+#ifdef L_m16stubdc10
+CALL_STUB_RET (__mips16_call_stub_dc_10, 10, DC)
+#endif
+#endif /* !__mips_single_float */
+#endif
diff --git a/libgcc/config/mips/t-mips16 b/libgcc/config/mips/t-mips16
index 46c7472f5f6..5553ed76e2d 100644
--- a/libgcc/config/mips/t-mips16
+++ b/libgcc/config/mips/t-mips16
@@ -1,3 +1,43 @@
+# Copyright (C) 2007, 2008, 2011 Free Software Foundation, Inc.
+#
+# This file is part of GCC.
+#
+# GCC is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GCC is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3.  If not see
+# <http://www.gnu.org/licenses/>.
+
+LIB1ASMSRC = mips/mips16.S
+LIB1ASMFUNCS = _m16addsf3 _m16subsf3 _m16mulsf3 _m16divsf3 \
+	_m16eqsf2 _m16nesf2 _m16gtsf2 _m16gesf2 _m16lesf2 _m16ltsf2 \
+	_m16unordsf2 \
+	_m16fltsisf _m16fix_truncsfsi _m16fltunsisf \
+	_m16adddf3 _m16subdf3 _m16muldf3 _m16divdf3 \
+	_m16extsfdf2 _m16trdfsf2 \
+	_m16eqdf2 _m16nedf2 _m16gtdf2 _m16gedf2 _m16ledf2 _m16ltdf2 \
+	_m16unorddf2 \
+	_m16fltsidf _m16fix_truncdfsi _m16fltunsidf \
+	_m16retsf _m16retdf \
+	_m16retsc _m16retdc \
+	_m16stub1 _m16stub2 _m16stub5 _m16stub6 _m16stub9 _m16stub10 \
+	_m16stubsf0 _m16stubsf1 _m16stubsf2 _m16stubsf5 _m16stubsf6 \
+	_m16stubsf9 _m16stubsf10 \
+	_m16stubdf0 _m16stubdf1 _m16stubdf2 _m16stubdf5 _m16stubdf6 \
+	_m16stubdf9 _m16stubdf10 \
+	_m16stubsc0 _m16stubsc1 _m16stubsc2 _m16stubsc5 _m16stubsc6 \
+	_m16stubsc9 _m16stubsc10 \
+	_m16stubdc0 _m16stubdc1 _m16stubdc2 _m16stubdc5 _m16stubdc6 \
+	_m16stubdc9 _m16stubdc10
+
 SYNC = yes
 SYNC_CFLAGS = -mno-mips16
 
diff --git a/libgcc/config/pa/milli64.S b/libgcc/config/pa/milli64.S
new file mode 100644
index 00000000000..2e9c4f741b6
--- /dev/null
+++ b/libgcc/config/pa/milli64.S
@@ -0,0 +1,2134 @@
+/* 32 and 64-bit millicode, original author Hewlett-Packard
+   adapted for gcc by Paul Bame <bame@debian.org>
+   and Alan Modra <alan@linuxcare.com.au>.
+
+   Copyright 2001, 2002, 2003, 2007, 2009 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#ifdef pa64
+        .level  2.0w
+#endif
+
+/* Hardware General Registers.  */
+r0:	.reg	%r0
+r1:	.reg	%r1
+r2:	.reg	%r2
+r3:	.reg	%r3
+r4:	.reg	%r4
+r5:	.reg	%r5
+r6:	.reg	%r6
+r7:	.reg	%r7
+r8:	.reg	%r8
+r9:	.reg	%r9
+r10:	.reg	%r10
+r11:	.reg	%r11
+r12:	.reg	%r12
+r13:	.reg	%r13
+r14:	.reg	%r14
+r15:	.reg	%r15
+r16:	.reg	%r16
+r17:	.reg	%r17
+r18:	.reg	%r18
+r19:	.reg	%r19
+r20:	.reg	%r20
+r21:	.reg	%r21
+r22:	.reg	%r22
+r23:	.reg	%r23
+r24:	.reg	%r24
+r25:	.reg	%r25
+r26:	.reg	%r26
+r27:	.reg	%r27
+r28:	.reg	%r28
+r29:	.reg	%r29
+r30:	.reg	%r30
+r31:	.reg	%r31
+
+/* Hardware Space Registers.  */
+sr0:	.reg	%sr0
+sr1:	.reg	%sr1
+sr2:	.reg	%sr2
+sr3:	.reg	%sr3
+sr4:	.reg	%sr4
+sr5:	.reg	%sr5
+sr6:	.reg	%sr6
+sr7:	.reg	%sr7
+
+/* Hardware Floating Point Registers.  */
+fr0:	.reg	%fr0
+fr1:	.reg	%fr1
+fr2:	.reg	%fr2
+fr3:	.reg	%fr3
+fr4:	.reg	%fr4
+fr5:	.reg	%fr5
+fr6:	.reg	%fr6
+fr7:	.reg	%fr7
+fr8:	.reg	%fr8
+fr9:	.reg	%fr9
+fr10:	.reg	%fr10
+fr11:	.reg	%fr11
+fr12:	.reg	%fr12
+fr13:	.reg	%fr13
+fr14:	.reg	%fr14
+fr15:	.reg	%fr15
+
+/* Hardware Control Registers.  */
+cr11:	.reg	%cr11
+sar:	.reg	%cr11	/* Shift Amount Register */
+
+/* Software Architecture General Registers.  */
+rp:	.reg    r2	/* return pointer */
+#ifdef pa64
+mrp:	.reg	r2 	/* millicode return pointer */
+#else
+mrp:	.reg	r31	/* millicode return pointer */
+#endif
+ret0:	.reg    r28	/* return value */
+ret1:	.reg    r29	/* return value (high part of double) */
+sp:	.reg 	r30	/* stack pointer */
+dp:	.reg	r27	/* data pointer */
+arg0:	.reg	r26	/* argument */
+arg1:	.reg	r25	/* argument or high part of double argument */
+arg2:	.reg	r24	/* argument */
+arg3:	.reg	r23	/* argument or high part of double argument */
+
+/* Software Architecture Space Registers.  */
+/* 		sr0	; return link from BLE */
+sret:	.reg	sr1	/* return value */
+sarg:	.reg	sr1	/* argument */
+/* 		sr4	; PC SPACE tracker */
+/* 		sr5	; process private data */
+
+/* Frame Offsets (millicode convention!)  Used when calling other
+   millicode routines.  Stack unwinding is dependent upon these
+   definitions.  */
+r31_slot:	.equ	-20	/* "current RP" slot */
+sr0_slot:	.equ	-16     /* "static link" slot */
+#if defined(pa64)
+mrp_slot:       .equ    -16	/* "current RP" slot */
+psp_slot:       .equ    -8	/* "previous SP" slot */
+#else
+mrp_slot:	.equ	-20     /* "current RP" slot (replacing "r31_slot") */
+#endif
+
+
+#define DEFINE(name,value)name:	.EQU	value
+#define RDEFINE(name,value)name:	.REG	value
+#ifdef milliext
+#define MILLI_BE(lbl)   BE    lbl(sr7,r0)
+#define MILLI_BEN(lbl)  BE,n  lbl(sr7,r0)
+#define MILLI_BLE(lbl)	BLE   lbl(sr7,r0)
+#define MILLI_BLEN(lbl)	BLE,n lbl(sr7,r0)
+#define MILLIRETN	BE,n  0(sr0,mrp)
+#define MILLIRET	BE    0(sr0,mrp)
+#define MILLI_RETN	BE,n  0(sr0,mrp)
+#define MILLI_RET	BE    0(sr0,mrp)
+#else
+#define MILLI_BE(lbl)	B     lbl
+#define MILLI_BEN(lbl)  B,n   lbl
+#define MILLI_BLE(lbl)	BL    lbl,mrp
+#define MILLI_BLEN(lbl)	BL,n  lbl,mrp
+#define MILLIRETN	BV,n  0(mrp)
+#define MILLIRET	BV    0(mrp)
+#define MILLI_RETN	BV,n  0(mrp)
+#define MILLI_RET	BV    0(mrp)
+#endif
+
+#ifdef __STDC__
+#define CAT(a,b)	a##b
+#else
+#define CAT(a,b)	a/**/b
+#endif
+
+#ifdef ELF
+#define SUBSPA_MILLI	 .section .text
+#define SUBSPA_MILLI_DIV .section .text.div,"ax",@progbits! .align 16
+#define SUBSPA_MILLI_MUL .section .text.mul,"ax",@progbits! .align 16
+#define ATTR_MILLI
+#define SUBSPA_DATA	 .section .data
+#define ATTR_DATA
+#define GLOBAL		 $global$
+#define GSYM(sym) 	 !sym:
+#define LSYM(sym)	 !CAT(.L,sym:)
+#define LREF(sym)	 CAT(.L,sym)
+
+#else
+
+#ifdef coff
+/* This used to be .milli but since link32 places different named
+   sections in different segments millicode ends up a long ways away
+   from .text (1meg?).  This way they will be a lot closer.
+
+   The SUBSPA_MILLI_* specify locality sets for certain millicode
+   modules in order to ensure that modules that call one another are
+   placed close together. Without locality sets this is unlikely to
+   happen because of the Dynamite linker library search algorithm. We
+   want these modules close together so that short calls always reach
+   (we don't want to require long calls or use long call stubs).  */
+
+#define SUBSPA_MILLI	 .subspa .text
+#define SUBSPA_MILLI_DIV .subspa .text$dv,align=16
+#define SUBSPA_MILLI_MUL .subspa .text$mu,align=16
+#define ATTR_MILLI	 .attr code,read,execute
+#define SUBSPA_DATA	 .subspa .data
+#define ATTR_DATA	 .attr init_data,read,write
+#define GLOBAL		 _gp
+#else
+#define SUBSPA_MILLI	 .subspa $MILLICODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,SORT=8
+#define SUBSPA_MILLI_DIV SUBSPA_MILLI
+#define SUBSPA_MILLI_MUL SUBSPA_MILLI
+#define ATTR_MILLI
+#define SUBSPA_DATA	 .subspa $BSS$,quad=1,align=8,access=0x1f,sort=80,zero
+#define ATTR_DATA
+#define GLOBAL		 $global$
+#endif
+#define SPACE_DATA	 .space $PRIVATE$,spnum=1,sort=16
+
+#define GSYM(sym)	 !sym
+#define LSYM(sym)	 !CAT(L$,sym)
+#define LREF(sym)	 CAT(L$,sym)
+#endif
+
+#ifdef L_dyncall
+	SUBSPA_MILLI
+	ATTR_DATA
+GSYM($$dyncall)
+	.export $$dyncall,millicode
+	.proc
+	.callinfo	millicode
+	.entry
+	bb,>=,n %r22,30,LREF(1)		; branch if not plabel address
+	depi	0,31,2,%r22		; clear the two least significant bits
+	ldw	4(%r22),%r19		; load new LTP value
+	ldw	0(%r22),%r22		; load address of target
+LSYM(1)
+#ifdef LINUX
+	bv	%r0(%r22)		; branch to the real target
+#else
+	ldsid	(%sr0,%r22),%r1		; get the "space ident" selected by r22
+	mtsp	%r1,%sr0		; move that space identifier into sr0
+	be	0(%sr0,%r22)		; branch to the real target
+#endif
+	stw	%r2,-24(%r30)		; save return address into frame marker
+	.exit
+	.procend
+#endif
+
+#ifdef L_divI
+/* ROUTINES:	$$divI, $$divoI
+
+   Single precision divide for signed binary integers.
+
+   The quotient is truncated towards zero.
+   The sign of the quotient is the XOR of the signs of the dividend and
+   divisor.
+   Divide by zero is trapped.
+   Divide of -2**31 by -1 is trapped for $$divoI but not for $$divI.
+
+   INPUT REGISTERS:
+   .	arg0 ==	dividend
+   .	arg1 ==	divisor
+   .	mrp  == return pc
+   .	sr0  == return space when called externally
+
+   OUTPUT REGISTERS:
+   .	arg0 =	undefined
+   .	arg1 =	undefined
+   .	ret1 =	quotient
+
+   OTHER REGISTERS AFFECTED:
+   .	r1   =	undefined
+
+   SIDE EFFECTS:
+   .	Causes a trap under the following conditions:
+   .		divisor is zero  (traps with ADDIT,=  0,25,0)
+   .		dividend==-2**31  and divisor==-1 and routine is $$divoI
+   .				 (traps with ADDO  26,25,0)
+   .	Changes memory at the following places:
+   .		NONE
+
+   PERMISSIBLE CONTEXT:
+   .	Unwindable.
+   .	Suitable for internal or external millicode.
+   .	Assumes the special millicode register conventions.
+
+   DISCUSSION:
+   .	Branchs to other millicode routines using BE
+   .		$$div_# for # being 2,3,4,5,6,7,8,9,10,12,14,15
+   .
+   .	For selected divisors, calls a divide by constant routine written by
+   .	Karl Pettis.  Eligible divisors are 1..15 excluding 11 and 13.
+   .
+   .	The only overflow case is -2**31 divided by -1.
+   .	Both routines return -2**31 but only $$divoI traps.  */
+
+RDEFINE(temp,r1)
+RDEFINE(retreg,ret1)	/*  r29 */
+RDEFINE(temp1,arg0)
+	SUBSPA_MILLI_DIV
+	ATTR_MILLI
+	.import $$divI_2,millicode
+	.import $$divI_3,millicode
+	.import $$divI_4,millicode
+	.import $$divI_5,millicode
+	.import $$divI_6,millicode
+	.import $$divI_7,millicode
+	.import $$divI_8,millicode
+	.import $$divI_9,millicode
+	.import $$divI_10,millicode
+	.import $$divI_12,millicode
+	.import $$divI_14,millicode
+	.import $$divI_15,millicode
+	.export $$divI,millicode
+	.export	$$divoI,millicode
+	.proc
+	.callinfo	millicode
+	.entry
+GSYM($$divoI)
+	comib,=,n  -1,arg1,LREF(negative1)	/*  when divisor == -1 */
+GSYM($$divI)
+	ldo	-1(arg1),temp		/*  is there at most one bit set ? */
+	and,<>	arg1,temp,r0		/*  if not, don't use power of 2 divide */
+	addi,>	0,arg1,r0		/*  if divisor > 0, use power of 2 divide */
+	b,n	LREF(neg_denom)
+LSYM(pow2)
+	addi,>=	0,arg0,retreg		/*  if numerator is negative, add the */
+	add	arg0,temp,retreg	/*  (denominaotr -1) to correct for shifts */
+	extru,=	arg1,15,16,temp		/*  test denominator with 0xffff0000 */
+	extrs	retreg,15,16,retreg	/*  retreg = retreg >> 16 */
+	or	arg1,temp,arg1		/*  arg1 = arg1 | (arg1 >> 16) */
+	ldi	0xcc,temp1		/*  setup 0xcc in temp1 */
+	extru,= arg1,23,8,temp		/*  test denominator with 0xff00 */
+	extrs	retreg,23,24,retreg	/*  retreg = retreg >> 8 */
+	or	arg1,temp,arg1		/*  arg1 = arg1 | (arg1 >> 8) */
+	ldi	0xaa,temp		/*  setup 0xaa in temp */
+	extru,= arg1,27,4,r0		/*  test denominator with 0xf0 */
+	extrs	retreg,27,28,retreg	/*  retreg = retreg >> 4 */
+	and,=	arg1,temp1,r0		/*  test denominator with 0xcc */
+	extrs	retreg,29,30,retreg	/*  retreg = retreg >> 2 */
+	and,=	arg1,temp,r0		/*  test denominator with 0xaa */
+	extrs	retreg,30,31,retreg	/*  retreg = retreg >> 1 */
+	MILLIRETN
+LSYM(neg_denom)
+	addi,<	0,arg1,r0		/*  if arg1 >= 0, it's not power of 2 */
+	b,n	LREF(regular_seq)
+	sub	r0,arg1,temp		/*  make denominator positive */
+	comb,=,n  arg1,temp,LREF(regular_seq)	/*  test against 0x80000000 and 0 */
+	ldo	-1(temp),retreg		/*  is there at most one bit set ? */
+	and,=	temp,retreg,r0		/*  if so, the denominator is power of 2 */
+	b,n	LREF(regular_seq)
+	sub	r0,arg0,retreg		/*  negate numerator */
+	comb,=,n arg0,retreg,LREF(regular_seq) /*  test against 0x80000000 */
+	copy	retreg,arg0		/*  set up arg0, arg1 and temp	*/
+	copy	temp,arg1		/*  before branching to pow2 */
+	b	LREF(pow2)
+	ldo	-1(arg1),temp
+LSYM(regular_seq)
+	comib,>>=,n 15,arg1,LREF(small_divisor)
+	add,>=	0,arg0,retreg		/*  move dividend, if retreg < 0, */
+LSYM(normal)
+	subi	0,retreg,retreg		/*    make it positive */
+	sub	0,arg1,temp		/*  clear carry,  */
+					/*    negate the divisor */
+	ds	0,temp,0		/*  set V-bit to the comple- */
+					/*    ment of the divisor sign */
+	add	retreg,retreg,retreg	/*  shift msb bit into carry */
+	ds	r0,arg1,temp		/*  1st divide step, if no carry */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  2nd divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  3rd divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  4th divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  5th divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  6th divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  7th divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  8th divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  9th divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  10th divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  11th divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  12th divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  13th divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  14th divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  15th divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  16th divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  17th divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  18th divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  19th divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  20th divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  21st divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  22nd divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  23rd divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  24th divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  25th divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  26th divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  27th divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  28th divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  29th divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  30th divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  31st divide step */
+	addc	retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds	temp,arg1,temp		/*  32nd divide step, */
+	addc	retreg,retreg,retreg	/*  shift last retreg bit into retreg */
+	xor,>=	arg0,arg1,0		/*  get correct sign of quotient */
+	  sub	0,retreg,retreg		/*    based on operand signs */
+	MILLIRETN
+	nop
+
+LSYM(small_divisor)
+
+#if defined(pa64)
+/*  Clear the upper 32 bits of the arg1 register.  We are working with	*/
+/*  small divisors (and 32-bit integers)   We must not be mislead  */
+/*  by "1" bits left in the upper 32 bits.  */
+	depd %r0,31,32,%r25
+#endif
+	blr,n	arg1,r0
+	nop
+/*  table for divisor == 0,1, ... ,15 */
+	addit,=	0,arg1,r0	/*  trap if divisor == 0 */
+	nop
+	MILLIRET		/*  divisor == 1 */
+	copy	arg0,retreg
+	MILLI_BEN($$divI_2)	/*  divisor == 2 */
+	nop
+	MILLI_BEN($$divI_3)	/*  divisor == 3 */
+	nop
+	MILLI_BEN($$divI_4)	/*  divisor == 4 */
+	nop
+	MILLI_BEN($$divI_5)	/*  divisor == 5 */
+	nop
+	MILLI_BEN($$divI_6)	/*  divisor == 6 */
+	nop
+	MILLI_BEN($$divI_7)	/*  divisor == 7 */
+	nop
+	MILLI_BEN($$divI_8)	/*  divisor == 8 */
+	nop
+	MILLI_BEN($$divI_9)	/*  divisor == 9 */
+	nop
+	MILLI_BEN($$divI_10)	/*  divisor == 10 */
+	nop
+	b	LREF(normal)		/*  divisor == 11 */
+	add,>=	0,arg0,retreg
+	MILLI_BEN($$divI_12)	/*  divisor == 12 */
+	nop
+	b	LREF(normal)		/*  divisor == 13 */
+	add,>=	0,arg0,retreg
+	MILLI_BEN($$divI_14)	/*  divisor == 14 */
+	nop
+	MILLI_BEN($$divI_15)	/*  divisor == 15 */
+	nop
+
+LSYM(negative1)
+	sub	0,arg0,retreg	/*  result is negation of dividend */
+	MILLIRET
+	addo	arg0,arg1,r0	/*  trap iff dividend==0x80000000 && divisor==-1 */
+	.exit
+	.procend
+	.end
+#endif
+
+#ifdef L_divU
+/* ROUTINE:	$$divU
+   .
+   .	Single precision divide for unsigned integers.
+   .
+   .	Quotient is truncated towards zero.
+   .	Traps on divide by zero.
+
+   INPUT REGISTERS:
+   .	arg0 ==	dividend
+   .	arg1 ==	divisor
+   .	mrp  == return pc
+   .	sr0  == return space when called externally
+
+   OUTPUT REGISTERS:
+   .	arg0 =	undefined
+   .	arg1 =	undefined
+   .	ret1 =	quotient
+
+   OTHER REGISTERS AFFECTED:
+   .	r1   =	undefined
+
+   SIDE EFFECTS:
+   .	Causes a trap under the following conditions:
+   .		divisor is zero
+   .	Changes memory at the following places:
+   .		NONE
+
+   PERMISSIBLE CONTEXT:
+   .	Unwindable.
+   .	Does not create a stack frame.
+   .	Suitable for internal or external millicode.
+   .	Assumes the special millicode register conventions.
+
+   DISCUSSION:
+   .	Branchs to other millicode routines using BE:
+   .		$$divU_# for 3,5,6,7,9,10,12,14,15
+   .
+   .	For selected small divisors calls the special divide by constant
+   .	routines written by Karl Pettis.  These are: 3,5,6,7,9,10,12,14,15.  */
+
+RDEFINE(temp,r1)
+RDEFINE(retreg,ret1)	/* r29 */
+RDEFINE(temp1,arg0)
+	SUBSPA_MILLI_DIV
+	ATTR_MILLI
+	.export $$divU,millicode
+	.import $$divU_3,millicode
+	.import $$divU_5,millicode
+	.import $$divU_6,millicode
+	.import $$divU_7,millicode
+	.import $$divU_9,millicode
+	.import $$divU_10,millicode
+	.import $$divU_12,millicode
+	.import $$divU_14,millicode
+	.import $$divU_15,millicode
+	.proc
+	.callinfo	millicode
+	.entry
+GSYM($$divU)
+/* The subtract is not nullified since it does no harm and can be used
+   by the two cases that branch back to "normal".  */
+	ldo	-1(arg1),temp		/* is there at most one bit set ? */
+	and,=	arg1,temp,r0		/* if so, denominator is power of 2 */
+	b	LREF(regular_seq)
+	addit,=	0,arg1,0		/* trap for zero dvr */
+	copy	arg0,retreg
+	extru,= arg1,15,16,temp		/* test denominator with 0xffff0000 */
+	extru	retreg,15,16,retreg	/* retreg = retreg >> 16 */
+	or	arg1,temp,arg1		/* arg1 = arg1 | (arg1 >> 16) */
+	ldi	0xcc,temp1		/* setup 0xcc in temp1 */
+	extru,= arg1,23,8,temp		/* test denominator with 0xff00 */
+	extru	retreg,23,24,retreg	/* retreg = retreg >> 8 */
+	or	arg1,temp,arg1		/* arg1 = arg1 | (arg1 >> 8) */
+	ldi	0xaa,temp		/* setup 0xaa in temp */
+	extru,= arg1,27,4,r0		/* test denominator with 0xf0 */
+	extru	retreg,27,28,retreg	/* retreg = retreg >> 4 */
+	and,=	arg1,temp1,r0		/* test denominator with 0xcc */
+	extru	retreg,29,30,retreg	/* retreg = retreg >> 2 */
+	and,=	arg1,temp,r0		/* test denominator with 0xaa */
+	extru	retreg,30,31,retreg	/* retreg = retreg >> 1 */
+	MILLIRETN
+	nop	
+LSYM(regular_seq)
+	comib,>=  15,arg1,LREF(special_divisor)
+	subi	0,arg1,temp		/* clear carry, negate the divisor */
+	ds	r0,temp,r0		/* set V-bit to 1 */
+LSYM(normal)
+	add	arg0,arg0,retreg	/* shift msb bit into carry */
+	ds	r0,arg1,temp		/* 1st divide step, if no carry */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 2nd divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 3rd divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 4th divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 5th divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 6th divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 7th divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 8th divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 9th divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 10th divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 11th divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 12th divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 13th divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 14th divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 15th divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 16th divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 17th divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 18th divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 19th divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 20th divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 21st divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 22nd divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 23rd divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 24th divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 25th divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 26th divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 27th divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 28th divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 29th divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 30th divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 31st divide step */
+	addc	retreg,retreg,retreg	/* shift retreg with/into carry */
+	ds	temp,arg1,temp		/* 32nd divide step, */
+	MILLIRET
+	addc	retreg,retreg,retreg	/* shift last retreg bit into retreg */
+
+/* Handle the cases where divisor is a small constant or has high bit on.  */
+LSYM(special_divisor)
+/*	blr	arg1,r0 */
+/*	comib,>,n  0,arg1,LREF(big_divisor) ; nullify previous instruction */
+
+/* Pratap 8/13/90. The 815 Stirling chip set has a bug that prevents us from
+   generating such a blr, comib sequence. A problem in nullification. So I
+   rewrote this code.  */
+
+#if defined(pa64)
+/* Clear the upper 32 bits of the arg1 register.  We are working with
+   small divisors (and 32-bit unsigned integers)   We must not be mislead
+   by "1" bits left in the upper 32 bits.  */
+	depd %r0,31,32,%r25
+#endif
+	comib,>	0,arg1,LREF(big_divisor)
+	nop
+	blr	arg1,r0
+	nop
+
+LSYM(zero_divisor)	/* this label is here to provide external visibility */
+	addit,=	0,arg1,0		/* trap for zero dvr */
+	nop
+	MILLIRET			/* divisor == 1 */
+	copy	arg0,retreg
+	MILLIRET			/* divisor == 2 */
+	extru	arg0,30,31,retreg
+	MILLI_BEN($$divU_3)		/* divisor == 3 */
+	nop
+	MILLIRET			/* divisor == 4 */
+	extru	arg0,29,30,retreg
+	MILLI_BEN($$divU_5)		/* divisor == 5 */
+	nop
+	MILLI_BEN($$divU_6)		/* divisor == 6 */
+	nop
+	MILLI_BEN($$divU_7)		/* divisor == 7 */
+	nop
+	MILLIRET			/* divisor == 8 */
+	extru	arg0,28,29,retreg
+	MILLI_BEN($$divU_9)		/* divisor == 9 */
+	nop
+	MILLI_BEN($$divU_10)		/* divisor == 10 */
+	nop
+	b	LREF(normal)		/* divisor == 11 */
+	ds	r0,temp,r0		/* set V-bit to 1 */
+	MILLI_BEN($$divU_12)		/* divisor == 12 */
+	nop
+	b	LREF(normal)		/* divisor == 13 */
+	ds	r0,temp,r0		/* set V-bit to 1 */
+	MILLI_BEN($$divU_14)		/* divisor == 14 */
+	nop
+	MILLI_BEN($$divU_15)		/* divisor == 15 */
+	nop
+
+/* Handle the case where the high bit is on in the divisor.
+   Compute:	if( dividend>=divisor) quotient=1; else quotient=0;
+   Note:	dividend>==divisor iff dividend-divisor does not borrow
+   and		not borrow iff carry.  */
+LSYM(big_divisor)
+	sub	arg0,arg1,r0
+	MILLIRET
+	addc	r0,r0,retreg
+	.exit
+	.procend
+	.end
+#endif
+
+#ifdef L_remI
+/* ROUTINE:	$$remI
+
+   DESCRIPTION:
+   .	$$remI returns the remainder of the division of two signed 32-bit
+   .	integers.  The sign of the remainder is the same as the sign of
+   .	the dividend.
+
+
+   INPUT REGISTERS:
+   .	arg0 == dividend
+   .	arg1 == divisor
+   .	mrp  == return pc
+   .	sr0  == return space when called externally
+
+   OUTPUT REGISTERS:
+   .	arg0 = destroyed
+   .	arg1 = destroyed
+   .	ret1 = remainder
+
+   OTHER REGISTERS AFFECTED:
+   .	r1   = undefined
+
+   SIDE EFFECTS:
+   .	Causes a trap under the following conditions:  DIVIDE BY ZERO
+   .	Changes memory at the following places:  NONE
+
+   PERMISSIBLE CONTEXT:
+   .	Unwindable
+   .	Does not create a stack frame
+   .	Is usable for internal or external microcode
+
+   DISCUSSION:
+   .	Calls other millicode routines via mrp:  NONE
+   .	Calls other millicode routines:  NONE  */
+
+RDEFINE(tmp,r1)
+RDEFINE(retreg,ret1)
+
+	SUBSPA_MILLI
+	ATTR_MILLI
+	.proc
+	.callinfo millicode
+	.entry
+GSYM($$remI)
+GSYM($$remoI)
+	.export $$remI,MILLICODE
+	.export $$remoI,MILLICODE
+	ldo		-1(arg1),tmp		/*  is there at most one bit set ? */
+	and,<>		arg1,tmp,r0		/*  if not, don't use power of 2 */
+	addi,>		0,arg1,r0		/*  if denominator > 0, use power */
+						/*  of 2 */
+	b,n		LREF(neg_denom)
+LSYM(pow2)
+	comb,>,n	0,arg0,LREF(neg_num)	/*  is numerator < 0 ? */
+	and		arg0,tmp,retreg		/*  get the result */
+	MILLIRETN
+LSYM(neg_num)
+	subi		0,arg0,arg0		/*  negate numerator */
+	and		arg0,tmp,retreg		/*  get the result */
+	subi		0,retreg,retreg		/*  negate result */
+	MILLIRETN
+LSYM(neg_denom)
+	addi,<		0,arg1,r0		/*  if arg1 >= 0, it's not power */
+						/*  of 2 */
+	b,n		LREF(regular_seq)
+	sub		r0,arg1,tmp		/*  make denominator positive */
+	comb,=,n	arg1,tmp,LREF(regular_seq) /*  test against 0x80000000 and 0 */
+	ldo		-1(tmp),retreg		/*  is there at most one bit set ? */
+	and,=		tmp,retreg,r0		/*  if not, go to regular_seq */
+	b,n		LREF(regular_seq)
+	comb,>,n	0,arg0,LREF(neg_num_2)	/*  if arg0 < 0, negate it  */
+	and		arg0,retreg,retreg
+	MILLIRETN
+LSYM(neg_num_2)
+	subi		0,arg0,tmp		/*  test against 0x80000000 */
+	and		tmp,retreg,retreg
+	subi		0,retreg,retreg
+	MILLIRETN
+LSYM(regular_seq)
+	addit,=		0,arg1,0		/*  trap if div by zero */
+	add,>=		0,arg0,retreg		/*  move dividend, if retreg < 0, */
+	sub		0,retreg,retreg		/*    make it positive */
+	sub		0,arg1, tmp		/*  clear carry,  */
+						/*    negate the divisor */
+	ds		0, tmp,0		/*  set V-bit to the comple- */
+						/*    ment of the divisor sign */
+	or		0,0, tmp		/*  clear  tmp */
+	add		retreg,retreg,retreg	/*  shift msb bit into carry */
+	ds		 tmp,arg1, tmp		/*  1st divide step, if no carry */
+						/*    out, msb of quotient = 0 */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+LSYM(t1)
+	ds		 tmp,arg1, tmp		/*  2nd divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  3rd divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  4th divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  5th divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  6th divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  7th divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  8th divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  9th divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  10th divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  11th divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  12th divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  13th divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  14th divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  15th divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  16th divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  17th divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  18th divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  19th divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  20th divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  21st divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  22nd divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  23rd divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  24th divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  25th divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  26th divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  27th divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  28th divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  29th divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  30th divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  31st divide step */
+	addc		retreg,retreg,retreg	/*  shift retreg with/into carry */
+	ds		 tmp,arg1, tmp		/*  32nd divide step, */
+	addc		retreg,retreg,retreg	/*  shift last bit into retreg */
+	movb,>=,n	 tmp,retreg,LREF(finish) /*  branch if pos.  tmp */
+	add,<		arg1,0,0		/*  if arg1 > 0, add arg1 */
+	add,tr		 tmp,arg1,retreg	/*    for correcting remainder tmp */
+	sub		 tmp,arg1,retreg	/*  else add absolute value arg1 */
+LSYM(finish)
+	add,>=		arg0,0,0		/*  set sign of remainder */
+	sub		0,retreg,retreg		/*    to sign of dividend */
+	MILLIRET
+	nop
+	.exit
+	.procend
+#ifdef milliext
+	.origin 0x00000200
+#endif
+	.end
+#endif
+
+#ifdef L_remU
+/* ROUTINE:	$$remU
+   .	Single precision divide for remainder with unsigned binary integers.
+   .
+   .	The remainder must be dividend-(dividend/divisor)*divisor.
+   .	Divide by zero is trapped.
+
+   INPUT REGISTERS:
+   .	arg0 ==	dividend
+   .	arg1 == divisor
+   .	mrp  == return pc
+   .	sr0  == return space when called externally
+
+   OUTPUT REGISTERS:
+   .	arg0 =	undefined
+   .	arg1 =	undefined
+   .	ret1 =	remainder
+
+   OTHER REGISTERS AFFECTED:
+   .	r1   =	undefined
+
+   SIDE EFFECTS:
+   .	Causes a trap under the following conditions:  DIVIDE BY ZERO
+   .	Changes memory at the following places:  NONE
+
+   PERMISSIBLE CONTEXT:
+   .	Unwindable.
+   .	Does not create a stack frame.
+   .	Suitable for internal or external millicode.
+   .	Assumes the special millicode register conventions.
+
+   DISCUSSION:
+   .	Calls other millicode routines using mrp: NONE
+   .	Calls other millicode routines: NONE  */
+
+
+RDEFINE(temp,r1)
+RDEFINE(rmndr,ret1)	/*  r29 */
+	SUBSPA_MILLI
+	ATTR_MILLI
+	.export $$remU,millicode
+	.proc
+	.callinfo	millicode
+	.entry
+GSYM($$remU)
+	ldo	-1(arg1),temp		/*  is there at most one bit set ? */
+	and,=	arg1,temp,r0		/*  if not, don't use power of 2 */
+	b	LREF(regular_seq)
+	addit,=	0,arg1,r0		/*  trap on div by zero */
+	and	arg0,temp,rmndr		/*  get the result for power of 2 */
+	MILLIRETN
+LSYM(regular_seq)
+	comib,>=,n  0,arg1,LREF(special_case)
+	subi	0,arg1,rmndr		/*  clear carry, negate the divisor */
+	ds	r0,rmndr,r0		/*  set V-bit to 1 */
+	add	arg0,arg0,temp		/*  shift msb bit into carry */
+	ds	r0,arg1,rmndr		/*  1st divide step, if no carry */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  2nd divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  3rd divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  4th divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  5th divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  6th divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  7th divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  8th divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  9th divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  10th divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  11th divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  12th divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  13th divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  14th divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  15th divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  16th divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  17th divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  18th divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  19th divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  20th divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  21st divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  22nd divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  23rd divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  24th divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  25th divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  26th divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  27th divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  28th divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  29th divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  30th divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  31st divide step */
+	addc	temp,temp,temp		/*  shift temp with/into carry */
+	ds	rmndr,arg1,rmndr		/*  32nd divide step, */
+	comiclr,<= 0,rmndr,r0
+	  add	rmndr,arg1,rmndr	/*  correction */
+	MILLIRETN
+	nop
+
+/* Putting >= on the last DS and deleting COMICLR does not work!  */
+LSYM(special_case)
+	sub,>>=	arg0,arg1,rmndr
+	  copy	arg0,rmndr
+	MILLIRETN
+	nop
+	.exit
+	.procend
+	.end
+#endif
+
+#ifdef L_div_const
+/* ROUTINE:	$$divI_2
+   .		$$divI_3	$$divU_3
+   .		$$divI_4
+   .		$$divI_5	$$divU_5
+   .		$$divI_6	$$divU_6
+   .		$$divI_7	$$divU_7
+   .		$$divI_8
+   .		$$divI_9	$$divU_9
+   .		$$divI_10	$$divU_10
+   .
+   .		$$divI_12	$$divU_12
+   .
+   .		$$divI_14	$$divU_14
+   .		$$divI_15	$$divU_15
+   .		$$divI_16
+   .		$$divI_17	$$divU_17
+   .
+   .	Divide by selected constants for single precision binary integers.
+
+   INPUT REGISTERS:
+   .	arg0 ==	dividend
+   .	mrp  == return pc
+   .	sr0  == return space when called externally
+
+   OUTPUT REGISTERS:
+   .	arg0 =	undefined
+   .	arg1 =	undefined
+   .	ret1 =	quotient
+
+   OTHER REGISTERS AFFECTED:
+   .	r1   =	undefined
+
+   SIDE EFFECTS:
+   .	Causes a trap under the following conditions: NONE
+   .	Changes memory at the following places:  NONE
+
+   PERMISSIBLE CONTEXT:
+   .	Unwindable.
+   .	Does not create a stack frame.
+   .	Suitable for internal or external millicode.
+   .	Assumes the special millicode register conventions.
+
+   DISCUSSION:
+   .	Calls other millicode routines using mrp:  NONE
+   .	Calls other millicode routines:  NONE  */
+
+
+/* TRUNCATED DIVISION BY SMALL INTEGERS
+
+   We are interested in q(x) = floor(x/y), where x >= 0 and y > 0
+   (with y fixed).
+
+   Let a = floor(z/y), for some choice of z.  Note that z will be
+   chosen so that division by z is cheap.
+
+   Let r be the remainder(z/y).  In other words, r = z - ay.
+
+   Now, our method is to choose a value for b such that
+
+   q'(x) = floor((ax+b)/z)
+
+   is equal to q(x) over as large a range of x as possible.  If the
+   two are equal over a sufficiently large range, and if it is easy to
+   form the product (ax), and it is easy to divide by z, then we can
+   perform the division much faster than the general division algorithm.
+
+   So, we want the following to be true:
+
+   .	For x in the following range:
+   .
+   .	    ky <= x < (k+1)y
+   .
+   .	implies that
+   .
+   .	    k <= (ax+b)/z < (k+1)
+
+   We want to determine b such that this is true for all k in the
+   range {0..K} for some maximum K.
+
+   Since (ax+b) is an increasing function of x, we can take each
+   bound separately to determine the "best" value for b.
+
+   (ax+b)/z < (k+1)	       implies
+
+   (a((k+1)y-1)+b < (k+1)z     implies
+
+   b < a + (k+1)(z-ay)	       implies
+
+   b < a + (k+1)r
+
+   This needs to be true for all k in the range {0..K}.  In
+   particular, it is true for k = 0 and this leads to a maximum
+   acceptable value for b.
+
+   b < a+r   or   b <= a+r-1
+
+   Taking the other bound, we have
+
+   k <= (ax+b)/z	       implies
+
+   k <= (aky+b)/z	       implies
+
+   k(z-ay) <= b		       implies
+
+   kr <= b
+
+   Clearly, the largest range for k will be achieved by maximizing b,
+   when r is not zero.	When r is zero, then the simplest choice for b
+   is 0.  When r is not 0, set
+
+   .	b = a+r-1
+
+   Now, by construction, q'(x) = floor((ax+b)/z) = q(x) = floor(x/y)
+   for all x in the range:
+
+   .	0 <= x < (K+1)y
+
+   We need to determine what K is.  Of our two bounds,
+
+   .	b < a+(k+1)r	is satisfied for all k >= 0, by construction.
+
+   The other bound is
+
+   .	kr <= b
+
+   This is always true if r = 0.  If r is not 0 (the usual case), then
+   K = floor((a+r-1)/r), is the maximum value for k.
+
+   Therefore, the formula q'(x) = floor((ax+b)/z) yields the correct
+   answer for q(x) = floor(x/y) when x is in the range
+
+   (0,(K+1)y-1)	       K = floor((a+r-1)/r)
+
+   To be most useful, we want (K+1)y-1 = (max x) >= 2**32-1 so that
+   the formula for q'(x) yields the correct value of q(x) for all x
+   representable by a single word in HPPA.
+
+   We are also constrained in that computing the product (ax), adding
+   b, and dividing by z must all be done quickly, otherwise we will be
+   better off going through the general algorithm using the DS
+   instruction, which uses approximately 70 cycles.
+
+   For each y, there is a choice of z which satisfies the constraints
+   for (K+1)y >= 2**32.  We may not, however, be able to satisfy the
+   timing constraints for arbitrary y.	It seems that z being equal to
+   a power of 2 or a power of 2 minus 1 is as good as we can do, since
+   it minimizes the time to do division by z.  We want the choice of z
+   to also result in a value for (a) that minimizes the computation of
+   the product (ax).  This is best achieved if (a) has a regular bit
+   pattern (so the multiplication can be done with shifts and adds).
+   The value of (a) also needs to be less than 2**32 so the product is
+   always guaranteed to fit in 2 words.
+
+   In actual practice, the following should be done:
+
+   1) For negative x, you should take the absolute value and remember
+   .  the fact so that the result can be negated.  This obviously does
+   .  not apply in the unsigned case.
+   2) For even y, you should factor out the power of 2 that divides y
+   .  and divide x by it.  You can then proceed by dividing by the
+   .  odd factor of y.
+
+   Here is a table of some odd values of y, and corresponding choices
+   for z which are "good".
+
+    y	  z	  r	 a (hex)     max x (hex)
+
+    3	2**32	  1	55555555      100000001
+    5	2**32	  1	33333333      100000003
+    7  2**24-1	  0	  249249     (infinite)
+    9  2**24-1	  0	  1c71c7     (infinite)
+   11  2**20-1	  0	   1745d     (infinite)
+   13  2**24-1	  0	  13b13b     (infinite)
+   15	2**32	  1	11111111      10000000d
+   17	2**32	  1	 f0f0f0f      10000000f
+
+   If r is 1, then b = a+r-1 = a.  This simplifies the computation
+   of (ax+b), since you can compute (x+1)(a) instead.  If r is 0,
+   then b = 0 is ok to use which simplifies (ax+b).
+
+   The bit patterns for 55555555, 33333333, and 11111111 are obviously
+   very regular.  The bit patterns for the other values of a above are:
+
+    y	   (hex)	  (binary)
+
+    7	  249249  001001001001001001001001  << regular >>
+    9	  1c71c7  000111000111000111000111  << regular >>
+   11	   1745d  000000010111010001011101  << irregular >>
+   13	  13b13b  000100111011000100111011  << irregular >>
+
+   The bit patterns for (a) corresponding to (y) of 11 and 13 may be
+   too irregular to warrant using this method.
+
+   When z is a power of 2 minus 1, then the division by z is slightly
+   more complicated, involving an iterative solution.
+
+   The code presented here solves division by 1 through 17, except for
+   11 and 13. There are algorithms for both signed and unsigned
+   quantities given.
+
+   TIMINGS (cycles)
+
+   divisor  positive  negative	unsigned
+
+   .   1	2	   2	     2
+   .   2	4	   4	     2
+   .   3       19	  21	    19
+   .   4	4	   4	     2
+   .   5       18	  22	    19
+   .   6       19	  22	    19
+   .   8	4	   4	     2
+   .  10       18	  19	    17
+   .  12       18	  20	    18
+   .  15       16	  18	    16
+   .  16	4	   4	     2
+   .  17       16	  18	    16
+
+   Now, the algorithm for 7, 9, and 14 is an iterative one.  That is,
+   a loop body is executed until the tentative quotient is 0.  The
+   number of times the loop body is executed varies depending on the
+   dividend, but is never more than two times.	If the dividend is
+   less than the divisor, then the loop body is not executed at all.
+   Each iteration adds 4 cycles to the timings.
+
+   divisor  positive  negative	unsigned
+
+   .   7       19+4n	 20+4n	   20+4n    n = number of iterations
+   .   9       21+4n	 22+4n	   21+4n
+   .  14       21+4n	 22+4n	   20+4n
+
+   To give an idea of how the number of iterations varies, here is a
+   table of dividend versus number of iterations when dividing by 7.
+
+   smallest	 largest       required
+   dividend	dividend      iterations
+
+   .	0	     6		    0
+   .	7	 0x6ffffff	    1
+   0x1000006	0xffffffff	    2
+
+   There is some overlap in the range of numbers requiring 1 and 2
+   iterations.	*/
+
+RDEFINE(t2,r1)
+RDEFINE(x2,arg0)	/*  r26 */
+RDEFINE(t1,arg1)	/*  r25 */
+RDEFINE(x1,ret1)	/*  r29 */
+
+	SUBSPA_MILLI_DIV
+	ATTR_MILLI
+
+	.proc
+	.callinfo	millicode
+	.entry
+/* NONE of these routines require a stack frame
+   ALL of these routines are unwindable from millicode	*/
+
+GSYM($$divide_by_constant)
+	.export $$divide_by_constant,millicode
+/*  Provides a "nice" label for the code covered by the unwind descriptor
+    for things like gprof.  */
+
+/* DIVISION BY 2 (shift by 1) */
+GSYM($$divI_2)
+	.export		$$divI_2,millicode
+	comclr,>=	arg0,0,0
+	addi		1,arg0,arg0
+	MILLIRET
+	extrs		arg0,30,31,ret1
+
+
+/* DIVISION BY 4 (shift by 2) */
+GSYM($$divI_4)
+	.export		$$divI_4,millicode
+	comclr,>=	arg0,0,0
+	addi		3,arg0,arg0
+	MILLIRET
+	extrs		arg0,29,30,ret1
+
+
+/* DIVISION BY 8 (shift by 3) */
+GSYM($$divI_8)
+	.export		$$divI_8,millicode
+	comclr,>=	arg0,0,0
+	addi		7,arg0,arg0
+	MILLIRET
+	extrs		arg0,28,29,ret1
+
+/* DIVISION BY 16 (shift by 4) */
+GSYM($$divI_16)
+	.export		$$divI_16,millicode
+	comclr,>=	arg0,0,0
+	addi		15,arg0,arg0
+	MILLIRET
+	extrs		arg0,27,28,ret1
+
+/****************************************************************************
+*
+*	DIVISION BY DIVISORS OF FFFFFFFF, and powers of 2 times these
+*
+*	includes 3,5,15,17 and also 6,10,12
+*
+****************************************************************************/
+
+/* DIVISION BY 3 (use z = 2**32; a = 55555555) */
+
+GSYM($$divI_3)
+	.export		$$divI_3,millicode
+	comb,<,N	x2,0,LREF(neg3)
+
+	addi		1,x2,x2		/* this cannot overflow	*/
+	extru		x2,1,2,x1	/* multiply by 5 to get started */
+	sh2add		x2,x2,x2
+	b		LREF(pos)
+	addc		x1,0,x1
+
+LSYM(neg3)
+	subi		1,x2,x2		/* this cannot overflow	*/
+	extru		x2,1,2,x1	/* multiply by 5 to get started */
+	sh2add		x2,x2,x2
+	b		LREF(neg)
+	addc		x1,0,x1
+
+GSYM($$divU_3)
+	.export		$$divU_3,millicode
+	addi		1,x2,x2		/* this CAN overflow */
+	addc		0,0,x1
+	shd		x1,x2,30,t1	/* multiply by 5 to get started */
+	sh2add		x2,x2,x2
+	b		LREF(pos)
+	addc		x1,t1,x1
+
+/* DIVISION BY 5 (use z = 2**32; a = 33333333) */
+
+GSYM($$divI_5)
+	.export		$$divI_5,millicode
+	comb,<,N	x2,0,LREF(neg5)
+
+	addi		3,x2,t1		/* this cannot overflow	*/
+	sh1add		x2,t1,x2	/* multiply by 3 to get started */
+	b		LREF(pos)
+	addc		0,0,x1
+
+LSYM(neg5)
+	sub		0,x2,x2		/* negate x2			*/
+	addi		1,x2,x2		/* this cannot overflow	*/
+	shd		0,x2,31,x1	/* get top bit (can be 1)	*/
+	sh1add		x2,x2,x2	/* multiply by 3 to get started */
+	b		LREF(neg)
+	addc		x1,0,x1
+
+GSYM($$divU_5)
+	.export		$$divU_5,millicode
+	addi		1,x2,x2		/* this CAN overflow */
+	addc		0,0,x1
+	shd		x1,x2,31,t1	/* multiply by 3 to get started */
+	sh1add		x2,x2,x2
+	b		LREF(pos)
+	addc		t1,x1,x1
+
+/* DIVISION BY	6 (shift to divide by 2 then divide by 3) */
+GSYM($$divI_6)
+	.export		$$divI_6,millicode
+	comb,<,N	x2,0,LREF(neg6)
+	extru		x2,30,31,x2	/* divide by 2			*/
+	addi		5,x2,t1		/* compute 5*(x2+1) = 5*x2+5	*/
+	sh2add		x2,t1,x2	/* multiply by 5 to get started */
+	b		LREF(pos)
+	addc		0,0,x1
+
+LSYM(neg6)
+	subi		2,x2,x2		/* negate, divide by 2, and add 1 */
+					/* negation and adding 1 are done */
+					/* at the same time by the SUBI   */
+	extru		x2,30,31,x2
+	shd		0,x2,30,x1
+	sh2add		x2,x2,x2	/* multiply by 5 to get started */
+	b		LREF(neg)
+	addc		x1,0,x1
+
+GSYM($$divU_6)
+	.export		$$divU_6,millicode
+	extru		x2,30,31,x2	/* divide by 2 */
+	addi		1,x2,x2		/* cannot carry */
+	shd		0,x2,30,x1	/* multiply by 5 to get started */
+	sh2add		x2,x2,x2
+	b		LREF(pos)
+	addc		x1,0,x1
+
+/* DIVISION BY 10 (shift to divide by 2 then divide by 5) */
+GSYM($$divU_10)
+	.export		$$divU_10,millicode
+	extru		x2,30,31,x2	/* divide by 2 */
+	addi		3,x2,t1		/* compute 3*(x2+1) = (3*x2)+3	*/
+	sh1add		x2,t1,x2	/* multiply by 3 to get started */
+	addc		0,0,x1
+LSYM(pos)
+	shd		x1,x2,28,t1	/* multiply by 0x11 */
+	shd		x2,0,28,t2
+	add		x2,t2,x2
+	addc		x1,t1,x1
+LSYM(pos_for_17)
+	shd		x1,x2,24,t1	/* multiply by 0x101 */
+	shd		x2,0,24,t2
+	add		x2,t2,x2
+	addc		x1,t1,x1
+
+	shd		x1,x2,16,t1	/* multiply by 0x10001 */
+	shd		x2,0,16,t2
+	add		x2,t2,x2
+	MILLIRET
+	addc		x1,t1,x1
+
+GSYM($$divI_10)
+	.export		$$divI_10,millicode
+	comb,<		x2,0,LREF(neg10)
+	copy		0,x1
+	extru		x2,30,31,x2	/* divide by 2 */
+	addib,TR	1,x2,LREF(pos)	/* add 1 (cannot overflow)     */
+	sh1add		x2,x2,x2	/* multiply by 3 to get started */
+
+LSYM(neg10)
+	subi		2,x2,x2		/* negate, divide by 2, and add 1 */
+					/* negation and adding 1 are done */
+					/* at the same time by the SUBI   */
+	extru		x2,30,31,x2
+	sh1add		x2,x2,x2	/* multiply by 3 to get started */
+LSYM(neg)
+	shd		x1,x2,28,t1	/* multiply by 0x11 */
+	shd		x2,0,28,t2
+	add		x2,t2,x2
+	addc		x1,t1,x1
+LSYM(neg_for_17)
+	shd		x1,x2,24,t1	/* multiply by 0x101 */
+	shd		x2,0,24,t2
+	add		x2,t2,x2
+	addc		x1,t1,x1
+
+	shd		x1,x2,16,t1	/* multiply by 0x10001 */
+	shd		x2,0,16,t2
+	add		x2,t2,x2
+	addc		x1,t1,x1
+	MILLIRET
+	sub		0,x1,x1
+
+/* DIVISION BY 12 (shift to divide by 4 then divide by 3) */
+GSYM($$divI_12)
+	.export		$$divI_12,millicode
+	comb,<		x2,0,LREF(neg12)
+	copy		0,x1
+	extru		x2,29,30,x2	/* divide by 4			*/
+	addib,tr	1,x2,LREF(pos)	/* compute 5*(x2+1) = 5*x2+5    */
+	sh2add		x2,x2,x2	/* multiply by 5 to get started */
+
+LSYM(neg12)
+	subi		4,x2,x2		/* negate, divide by 4, and add 1 */
+					/* negation and adding 1 are done */
+					/* at the same time by the SUBI   */
+	extru		x2,29,30,x2
+	b		LREF(neg)
+	sh2add		x2,x2,x2	/* multiply by 5 to get started */
+
+GSYM($$divU_12)
+	.export		$$divU_12,millicode
+	extru		x2,29,30,x2	/* divide by 4   */
+	addi		5,x2,t1		/* cannot carry */
+	sh2add		x2,t1,x2	/* multiply by 5 to get started */
+	b		LREF(pos)
+	addc		0,0,x1
+
+/* DIVISION BY 15 (use z = 2**32; a = 11111111) */
+GSYM($$divI_15)
+	.export		$$divI_15,millicode
+	comb,<		x2,0,LREF(neg15)
+	copy		0,x1
+	addib,tr	1,x2,LREF(pos)+4
+	shd		x1,x2,28,t1
+
+LSYM(neg15)
+	b		LREF(neg)
+	subi		1,x2,x2
+
+GSYM($$divU_15)
+	.export		$$divU_15,millicode
+	addi		1,x2,x2		/* this CAN overflow */
+	b		LREF(pos)
+	addc		0,0,x1
+
+/* DIVISION BY 17 (use z = 2**32; a =  f0f0f0f) */
+GSYM($$divI_17)
+	.export		$$divI_17,millicode
+	comb,<,n	x2,0,LREF(neg17)
+	addi		1,x2,x2		/* this cannot overflow */
+	shd		0,x2,28,t1	/* multiply by 0xf to get started */
+	shd		x2,0,28,t2
+	sub		t2,x2,x2
+	b		LREF(pos_for_17)
+	subb		t1,0,x1
+
+LSYM(neg17)
+	subi		1,x2,x2		/* this cannot overflow */
+	shd		0,x2,28,t1	/* multiply by 0xf to get started */
+	shd		x2,0,28,t2
+	sub		t2,x2,x2
+	b		LREF(neg_for_17)
+	subb		t1,0,x1
+
+GSYM($$divU_17)
+	.export		$$divU_17,millicode
+	addi		1,x2,x2		/* this CAN overflow */
+	addc		0,0,x1
+	shd		x1,x2,28,t1	/* multiply by 0xf to get started */
+LSYM(u17)
+	shd		x2,0,28,t2
+	sub		t2,x2,x2
+	b		LREF(pos_for_17)
+	subb		t1,x1,x1
+
+
+/* DIVISION BY DIVISORS OF FFFFFF, and powers of 2 times these
+   includes 7,9 and also 14
+
+
+   z = 2**24-1
+   r = z mod x = 0
+
+   so choose b = 0
+
+   Also, in order to divide by z = 2**24-1, we approximate by dividing
+   by (z+1) = 2**24 (which is easy), and then correcting.
+
+   (ax) = (z+1)q' + r
+   .	= zq' + (q'+r)
+
+   So to compute (ax)/z, compute q' = (ax)/(z+1) and r = (ax) mod (z+1)
+   Then the true remainder of (ax)/z is (q'+r).  Repeat the process
+   with this new remainder, adding the tentative quotients together,
+   until a tentative quotient is 0 (and then we are done).  There is
+   one last correction to be done.  It is possible that (q'+r) = z.
+   If so, then (q'+r)/(z+1) = 0 and it looks like we are done.	But,
+   in fact, we need to add 1 more to the quotient.  Now, it turns
+   out that this happens if and only if the original value x is
+   an exact multiple of y.  So, to avoid a three instruction test at
+   the end, instead use 1 instruction to add 1 to x at the beginning.  */
+
+/* DIVISION BY 7 (use z = 2**24-1; a = 249249) */
+GSYM($$divI_7)
+	.export		$$divI_7,millicode
+	comb,<,n	x2,0,LREF(neg7)
+LSYM(7)
+	addi		1,x2,x2		/* cannot overflow */
+	shd		0,x2,29,x1
+	sh3add		x2,x2,x2
+	addc		x1,0,x1
+LSYM(pos7)
+	shd		x1,x2,26,t1
+	shd		x2,0,26,t2
+	add		x2,t2,x2
+	addc		x1,t1,x1
+
+	shd		x1,x2,20,t1
+	shd		x2,0,20,t2
+	add		x2,t2,x2
+	addc		x1,t1,t1
+
+	/* computed <t1,x2>.  Now divide it by (2**24 - 1)	*/
+
+	copy		0,x1
+	shd,=		t1,x2,24,t1	/* tentative quotient  */
+LSYM(1)
+	addb,tr		t1,x1,LREF(2)	/* add to previous quotient   */
+	extru		x2,31,24,x2	/* new remainder (unadjusted) */
+
+	MILLIRETN
+
+LSYM(2)
+	addb,tr		t1,x2,LREF(1)	/* adjust remainder */
+	extru,=		x2,7,8,t1	/* new quotient     */
+
+LSYM(neg7)
+	subi		1,x2,x2		/* negate x2 and add 1 */
+LSYM(8)
+	shd		0,x2,29,x1
+	sh3add		x2,x2,x2
+	addc		x1,0,x1
+
+LSYM(neg7_shift)
+	shd		x1,x2,26,t1
+	shd		x2,0,26,t2
+	add		x2,t2,x2
+	addc		x1,t1,x1
+
+	shd		x1,x2,20,t1
+	shd		x2,0,20,t2
+	add		x2,t2,x2
+	addc		x1,t1,t1
+
+	/* computed <t1,x2>.  Now divide it by (2**24 - 1)	*/
+
+	copy		0,x1
+	shd,=		t1,x2,24,t1	/* tentative quotient  */
+LSYM(3)
+	addb,tr		t1,x1,LREF(4)	/* add to previous quotient   */
+	extru		x2,31,24,x2	/* new remainder (unadjusted) */
+
+	MILLIRET
+	sub		0,x1,x1		/* negate result    */
+
+LSYM(4)
+	addb,tr		t1,x2,LREF(3)	/* adjust remainder */
+	extru,=		x2,7,8,t1	/* new quotient     */
+
+GSYM($$divU_7)
+	.export		$$divU_7,millicode
+	addi		1,x2,x2		/* can carry */
+	addc		0,0,x1
+	shd		x1,x2,29,t1
+	sh3add		x2,x2,x2
+	b		LREF(pos7)
+	addc		t1,x1,x1
+
+/* DIVISION BY 9 (use z = 2**24-1; a = 1c71c7) */
+GSYM($$divI_9)
+	.export		$$divI_9,millicode
+	comb,<,n	x2,0,LREF(neg9)
+	addi		1,x2,x2		/* cannot overflow */
+	shd		0,x2,29,t1
+	shd		x2,0,29,t2
+	sub		t2,x2,x2
+	b		LREF(pos7)
+	subb		t1,0,x1
+
+LSYM(neg9)
+	subi		1,x2,x2		/* negate and add 1 */
+	shd		0,x2,29,t1
+	shd		x2,0,29,t2
+	sub		t2,x2,x2
+	b		LREF(neg7_shift)
+	subb		t1,0,x1
+
+GSYM($$divU_9)
+	.export		$$divU_9,millicode
+	addi		1,x2,x2		/* can carry */
+	addc		0,0,x1
+	shd		x1,x2,29,t1
+	shd		x2,0,29,t2
+	sub		t2,x2,x2
+	b		LREF(pos7)
+	subb		t1,x1,x1
+
+/* DIVISION BY 14 (shift to divide by 2 then divide by 7) */
+GSYM($$divI_14)
+	.export		$$divI_14,millicode
+	comb,<,n	x2,0,LREF(neg14)
+GSYM($$divU_14)
+	.export		$$divU_14,millicode
+	b		LREF(7)		/* go to 7 case */
+	extru		x2,30,31,x2	/* divide by 2  */
+
+LSYM(neg14)
+	subi		2,x2,x2		/* negate (and add 2) */
+	b		LREF(8)
+	extru		x2,30,31,x2	/* divide by 2	      */
+	.exit
+	.procend
+	.end
+#endif
+
+#ifdef L_mulI
+/* VERSION "@(#)$$mulI $ Revision: 12.4 $ $ Date: 94/03/17 17:18:51 $" */
+/******************************************************************************
+This routine is used on PA2.0 processors when gcc -mno-fpregs is used
+
+ROUTINE:	$$mulI
+
+
+DESCRIPTION:	
+
+	$$mulI multiplies two single word integers, giving a single 
+	word result.  
+
+
+INPUT REGISTERS:
+
+	arg0 = Operand 1
+	arg1 = Operand 2
+	r31  == return pc
+	sr0  == return space when called externally 
+
+
+OUTPUT REGISTERS:
+
+	arg0 = undefined
+	arg1 = undefined
+	ret1 = result 
+
+OTHER REGISTERS AFFECTED:
+
+	r1   = undefined
+
+SIDE EFFECTS:
+
+	Causes a trap under the following conditions:  NONE
+	Changes memory at the following places:  NONE
+
+PERMISSIBLE CONTEXT:
+
+	Unwindable
+	Does not create a stack frame
+	Is usable for internal or external microcode
+
+DISCUSSION:
+
+	Calls other millicode routines via mrp:  NONE
+	Calls other millicode routines:  NONE
+
+***************************************************************************/
+
+
+#define	a0	%arg0
+#define	a1	%arg1
+#define	t0	%r1
+#define	r	%ret1
+
+#define	a0__128a0	zdep	a0,24,25,a0
+#define	a0__256a0	zdep	a0,23,24,a0
+#define	a1_ne_0_b_l0	comb,<>	a1,0,LREF(l0)
+#define	a1_ne_0_b_l1	comb,<>	a1,0,LREF(l1)
+#define	a1_ne_0_b_l2	comb,<>	a1,0,LREF(l2)
+#define	b_n_ret_t0	b,n	LREF(ret_t0)
+#define	b_e_shift	b	LREF(e_shift)
+#define	b_e_t0ma0	b	LREF(e_t0ma0)
+#define	b_e_t0		b	LREF(e_t0)
+#define	b_e_t0a0	b	LREF(e_t0a0)
+#define	b_e_t02a0	b	LREF(e_t02a0)
+#define	b_e_t04a0	b	LREF(e_t04a0)
+#define	b_e_2t0		b	LREF(e_2t0)
+#define	b_e_2t0a0	b	LREF(e_2t0a0)
+#define	b_e_2t04a0	b	LREF(e2t04a0)
+#define	b_e_3t0		b	LREF(e_3t0)
+#define	b_e_4t0		b	LREF(e_4t0)
+#define	b_e_4t0a0	b	LREF(e_4t0a0)
+#define	b_e_4t08a0	b	LREF(e4t08a0)
+#define	b_e_5t0		b	LREF(e_5t0)
+#define	b_e_8t0		b	LREF(e_8t0)
+#define	b_e_8t0a0	b	LREF(e_8t0a0)
+#define	r__r_a0		add	r,a0,r
+#define	r__r_2a0	sh1add	a0,r,r
+#define	r__r_4a0	sh2add	a0,r,r
+#define	r__r_8a0	sh3add	a0,r,r
+#define	r__r_t0		add	r,t0,r
+#define	r__r_2t0	sh1add	t0,r,r
+#define	r__r_4t0	sh2add	t0,r,r
+#define	r__r_8t0	sh3add	t0,r,r
+#define	t0__3a0		sh1add	a0,a0,t0
+#define	t0__4a0		sh2add	a0,0,t0
+#define	t0__5a0		sh2add	a0,a0,t0
+#define	t0__8a0		sh3add	a0,0,t0
+#define	t0__9a0		sh3add	a0,a0,t0
+#define	t0__16a0	zdep	a0,27,28,t0
+#define	t0__32a0	zdep	a0,26,27,t0
+#define	t0__64a0	zdep	a0,25,26,t0
+#define	t0__128a0	zdep	a0,24,25,t0
+#define	t0__t0ma0	sub	t0,a0,t0
+#define	t0__t0_a0	add	t0,a0,t0
+#define	t0__t0_2a0	sh1add	a0,t0,t0
+#define	t0__t0_4a0	sh2add	a0,t0,t0
+#define	t0__t0_8a0	sh3add	a0,t0,t0
+#define	t0__2t0_a0	sh1add	t0,a0,t0
+#define	t0__3t0		sh1add	t0,t0,t0
+#define	t0__4t0		sh2add	t0,0,t0
+#define	t0__4t0_a0	sh2add	t0,a0,t0
+#define	t0__5t0		sh2add	t0,t0,t0
+#define	t0__8t0		sh3add	t0,0,t0
+#define	t0__8t0_a0	sh3add	t0,a0,t0
+#define	t0__9t0		sh3add	t0,t0,t0
+#define	t0__16t0	zdep	t0,27,28,t0
+#define	t0__32t0	zdep	t0,26,27,t0
+#define	t0__256a0	zdep	a0,23,24,t0
+
+
+	SUBSPA_MILLI
+	ATTR_MILLI
+	.align 16
+	.proc
+	.callinfo millicode
+	.export $$mulI,millicode
+GSYM($$mulI)	
+	combt,<<=	a1,a0,LREF(l4)	/* swap args if unsigned a1>a0 */
+	copy		0,r		/* zero out the result */
+	xor		a0,a1,a0	/* swap a0 & a1 using the */
+	xor		a0,a1,a1	/*  old xor trick */
+	xor		a0,a1,a0
+LSYM(l4)
+	combt,<=	0,a0,LREF(l3)		/* if a0>=0 then proceed like unsigned */
+	zdep		a1,30,8,t0	/* t0 = (a1&0xff)<<1 ********* */
+	sub,>		0,a1,t0		/* otherwise negate both and */
+	combt,<=,n	a0,t0,LREF(l2)	/*  swap back if |a0|<|a1| */
+	sub		0,a0,a1
+	movb,tr,n	t0,a0,LREF(l2)	/* 10th inst.  */
+
+LSYM(l0)	r__r_t0				/* add in this partial product */
+LSYM(l1)	a0__256a0			/* a0 <<= 8 ****************** */
+LSYM(l2)	zdep		a1,30,8,t0	/* t0 = (a1&0xff)<<1 ********* */
+LSYM(l3)	blr		t0,0		/* case on these 8 bits ****** */
+		extru		a1,23,24,a1	/* a1 >>= 8 ****************** */
+
+/*16 insts before this.  */
+/*			  a0 <<= 8 ************************** */
+LSYM(x0)	a1_ne_0_b_l2	! a0__256a0	! MILLIRETN	! nop
+LSYM(x1)	a1_ne_0_b_l1	! r__r_a0	! MILLIRETN	! nop
+LSYM(x2)	a1_ne_0_b_l1	! r__r_2a0	! MILLIRETN	! nop
+LSYM(x3)	a1_ne_0_b_l0	! t0__3a0	! MILLIRET	! r__r_t0
+LSYM(x4)	a1_ne_0_b_l1	! r__r_4a0	! MILLIRETN	! nop
+LSYM(x5)	a1_ne_0_b_l0	! t0__5a0	! MILLIRET	! r__r_t0
+LSYM(x6)	t0__3a0		! a1_ne_0_b_l1	! r__r_2t0	! MILLIRETN
+LSYM(x7)	t0__3a0		! a1_ne_0_b_l0	! r__r_4a0	! b_n_ret_t0
+LSYM(x8)	a1_ne_0_b_l1	! r__r_8a0	! MILLIRETN	! nop
+LSYM(x9)	a1_ne_0_b_l0	! t0__9a0	! MILLIRET	! r__r_t0
+LSYM(x10)	t0__5a0		! a1_ne_0_b_l1	! r__r_2t0	! MILLIRETN
+LSYM(x11)	t0__3a0		! a1_ne_0_b_l0	! r__r_8a0	! b_n_ret_t0
+LSYM(x12)	t0__3a0		! a1_ne_0_b_l1	! r__r_4t0	! MILLIRETN
+LSYM(x13)	t0__5a0		! a1_ne_0_b_l0	! r__r_8a0	! b_n_ret_t0
+LSYM(x14)	t0__3a0		! t0__2t0_a0	! b_e_shift	! r__r_2t0
+LSYM(x15)	t0__5a0		! a1_ne_0_b_l0	! t0__3t0	! b_n_ret_t0
+LSYM(x16)	t0__16a0	! a1_ne_0_b_l1	! r__r_t0	! MILLIRETN
+LSYM(x17)	t0__9a0		! a1_ne_0_b_l0	! t0__t0_8a0	! b_n_ret_t0
+LSYM(x18)	t0__9a0		! a1_ne_0_b_l1	! r__r_2t0	! MILLIRETN
+LSYM(x19)	t0__9a0		! a1_ne_0_b_l0	! t0__2t0_a0	! b_n_ret_t0
+LSYM(x20)	t0__5a0		! a1_ne_0_b_l1	! r__r_4t0	! MILLIRETN
+LSYM(x21)	t0__5a0		! a1_ne_0_b_l0	! t0__4t0_a0	! b_n_ret_t0
+LSYM(x22)	t0__5a0		! t0__2t0_a0	! b_e_shift	! r__r_2t0
+LSYM(x23)	t0__5a0		! t0__2t0_a0	! b_e_t0	! t0__2t0_a0
+LSYM(x24)	t0__3a0		! a1_ne_0_b_l1	! r__r_8t0	! MILLIRETN
+LSYM(x25)	t0__5a0		! a1_ne_0_b_l0	! t0__5t0	! b_n_ret_t0
+LSYM(x26)	t0__3a0		! t0__4t0_a0	! b_e_shift	! r__r_2t0
+LSYM(x27)	t0__3a0		! a1_ne_0_b_l0	! t0__9t0	! b_n_ret_t0
+LSYM(x28)	t0__3a0		! t0__2t0_a0	! b_e_shift	! r__r_4t0
+LSYM(x29)	t0__3a0		! t0__2t0_a0	! b_e_t0	! t0__4t0_a0
+LSYM(x30)	t0__5a0		! t0__3t0	! b_e_shift	! r__r_2t0
+LSYM(x31)	t0__32a0	! a1_ne_0_b_l0	! t0__t0ma0	! b_n_ret_t0
+LSYM(x32)	t0__32a0	! a1_ne_0_b_l1	! r__r_t0	! MILLIRETN
+LSYM(x33)	t0__8a0		! a1_ne_0_b_l0	! t0__4t0_a0	! b_n_ret_t0
+LSYM(x34)	t0__16a0	! t0__t0_a0	! b_e_shift	! r__r_2t0
+LSYM(x35)	t0__9a0		! t0__3t0	! b_e_t0	! t0__t0_8a0
+LSYM(x36)	t0__9a0		! a1_ne_0_b_l1	! r__r_4t0	! MILLIRETN
+LSYM(x37)	t0__9a0		! a1_ne_0_b_l0	! t0__4t0_a0	! b_n_ret_t0
+LSYM(x38)	t0__9a0		! t0__2t0_a0	! b_e_shift	! r__r_2t0
+LSYM(x39)	t0__9a0		! t0__2t0_a0	! b_e_t0	! t0__2t0_a0
+LSYM(x40)	t0__5a0		! a1_ne_0_b_l1	! r__r_8t0	! MILLIRETN
+LSYM(x41)	t0__5a0		! a1_ne_0_b_l0	! t0__8t0_a0	! b_n_ret_t0
+LSYM(x42)	t0__5a0		! t0__4t0_a0	! b_e_shift	! r__r_2t0
+LSYM(x43)	t0__5a0		! t0__4t0_a0	! b_e_t0	! t0__2t0_a0
+LSYM(x44)	t0__5a0		! t0__2t0_a0	! b_e_shift	! r__r_4t0
+LSYM(x45)	t0__9a0		! a1_ne_0_b_l0	! t0__5t0	! b_n_ret_t0
+LSYM(x46)	t0__9a0		! t0__5t0	! b_e_t0	! t0__t0_a0
+LSYM(x47)	t0__9a0		! t0__5t0	! b_e_t0	! t0__t0_2a0
+LSYM(x48)	t0__3a0		! a1_ne_0_b_l0	! t0__16t0	! b_n_ret_t0
+LSYM(x49)	t0__9a0		! t0__5t0	! b_e_t0	! t0__t0_4a0
+LSYM(x50)	t0__5a0		! t0__5t0	! b_e_shift	! r__r_2t0
+LSYM(x51)	t0__9a0		! t0__t0_8a0	! b_e_t0	! t0__3t0
+LSYM(x52)	t0__3a0		! t0__4t0_a0	! b_e_shift	! r__r_4t0
+LSYM(x53)	t0__3a0		! t0__4t0_a0	! b_e_t0	! t0__4t0_a0
+LSYM(x54)	t0__9a0		! t0__3t0	! b_e_shift	! r__r_2t0
+LSYM(x55)	t0__9a0		! t0__3t0	! b_e_t0	! t0__2t0_a0
+LSYM(x56)	t0__3a0		! t0__2t0_a0	! b_e_shift	! r__r_8t0
+LSYM(x57)	t0__9a0		! t0__2t0_a0	! b_e_t0	! t0__3t0
+LSYM(x58)	t0__3a0		! t0__2t0_a0	! b_e_2t0	! t0__4t0_a0
+LSYM(x59)	t0__9a0		! t0__2t0_a0	! b_e_t02a0	! t0__3t0
+LSYM(x60)	t0__5a0		! t0__3t0	! b_e_shift	! r__r_4t0
+LSYM(x61)	t0__5a0		! t0__3t0	! b_e_t0	! t0__4t0_a0
+LSYM(x62)	t0__32a0	! t0__t0ma0	! b_e_shift	! r__r_2t0
+LSYM(x63)	t0__64a0	! a1_ne_0_b_l0	! t0__t0ma0	! b_n_ret_t0
+LSYM(x64)	t0__64a0	! a1_ne_0_b_l1	! r__r_t0	! MILLIRETN
+LSYM(x65)	t0__8a0		! a1_ne_0_b_l0	! t0__8t0_a0	! b_n_ret_t0
+LSYM(x66)	t0__32a0	! t0__t0_a0	! b_e_shift	! r__r_2t0
+LSYM(x67)	t0__8a0		! t0__4t0_a0	! b_e_t0	! t0__2t0_a0
+LSYM(x68)	t0__8a0		! t0__2t0_a0	! b_e_shift	! r__r_4t0
+LSYM(x69)	t0__8a0		! t0__2t0_a0	! b_e_t0	! t0__4t0_a0
+LSYM(x70)	t0__64a0	! t0__t0_4a0	! b_e_t0	! t0__t0_2a0
+LSYM(x71)	t0__9a0		! t0__8t0	! b_e_t0	! t0__t0ma0
+LSYM(x72)	t0__9a0		! a1_ne_0_b_l1	! r__r_8t0	! MILLIRETN
+LSYM(x73)	t0__9a0		! t0__8t0_a0	! b_e_shift	! r__r_t0
+LSYM(x74)	t0__9a0		! t0__4t0_a0	! b_e_shift	! r__r_2t0
+LSYM(x75)	t0__9a0		! t0__4t0_a0	! b_e_t0	! t0__2t0_a0
+LSYM(x76)	t0__9a0		! t0__2t0_a0	! b_e_shift	! r__r_4t0
+LSYM(x77)	t0__9a0		! t0__2t0_a0	! b_e_t0	! t0__4t0_a0
+LSYM(x78)	t0__9a0		! t0__2t0_a0	! b_e_2t0	! t0__2t0_a0
+LSYM(x79)	t0__16a0	! t0__5t0	! b_e_t0	! t0__t0ma0
+LSYM(x80)	t0__16a0	! t0__5t0	! b_e_shift	! r__r_t0
+LSYM(x81)	t0__9a0		! t0__9t0	! b_e_shift	! r__r_t0
+LSYM(x82)	t0__5a0		! t0__8t0_a0	! b_e_shift	! r__r_2t0
+LSYM(x83)	t0__5a0		! t0__8t0_a0	! b_e_t0	! t0__2t0_a0
+LSYM(x84)	t0__5a0		! t0__4t0_a0	! b_e_shift	! r__r_4t0
+LSYM(x85)	t0__8a0		! t0__2t0_a0	! b_e_t0	! t0__5t0
+LSYM(x86)	t0__5a0		! t0__4t0_a0	! b_e_2t0	! t0__2t0_a0
+LSYM(x87)	t0__9a0		! t0__9t0	! b_e_t02a0	! t0__t0_4a0
+LSYM(x88)	t0__5a0		! t0__2t0_a0	! b_e_shift	! r__r_8t0
+LSYM(x89)	t0__5a0		! t0__2t0_a0	! b_e_t0	! t0__8t0_a0
+LSYM(x90)	t0__9a0		! t0__5t0	! b_e_shift	! r__r_2t0
+LSYM(x91)	t0__9a0		! t0__5t0	! b_e_t0	! t0__2t0_a0
+LSYM(x92)	t0__5a0		! t0__2t0_a0	! b_e_4t0	! t0__2t0_a0
+LSYM(x93)	t0__32a0	! t0__t0ma0	! b_e_t0	! t0__3t0
+LSYM(x94)	t0__9a0		! t0__5t0	! b_e_2t0	! t0__t0_2a0
+LSYM(x95)	t0__9a0		! t0__2t0_a0	! b_e_t0	! t0__5t0
+LSYM(x96)	t0__8a0		! t0__3t0	! b_e_shift	! r__r_4t0
+LSYM(x97)	t0__8a0		! t0__3t0	! b_e_t0	! t0__4t0_a0
+LSYM(x98)	t0__32a0	! t0__3t0	! b_e_t0	! t0__t0_2a0
+LSYM(x99)	t0__8a0		! t0__4t0_a0	! b_e_t0	! t0__3t0
+LSYM(x100)	t0__5a0		! t0__5t0	! b_e_shift	! r__r_4t0
+LSYM(x101)	t0__5a0		! t0__5t0	! b_e_t0	! t0__4t0_a0
+LSYM(x102)	t0__32a0	! t0__t0_2a0	! b_e_t0	! t0__3t0
+LSYM(x103)	t0__5a0		! t0__5t0	! b_e_t02a0	! t0__4t0_a0
+LSYM(x104)	t0__3a0		! t0__4t0_a0	! b_e_shift	! r__r_8t0
+LSYM(x105)	t0__5a0		! t0__4t0_a0	! b_e_t0	! t0__5t0
+LSYM(x106)	t0__3a0		! t0__4t0_a0	! b_e_2t0	! t0__4t0_a0
+LSYM(x107)	t0__9a0		! t0__t0_4a0	! b_e_t02a0	! t0__8t0_a0
+LSYM(x108)	t0__9a0		! t0__3t0	! b_e_shift	! r__r_4t0
+LSYM(x109)	t0__9a0		! t0__3t0	! b_e_t0	! t0__4t0_a0
+LSYM(x110)	t0__9a0		! t0__3t0	! b_e_2t0	! t0__2t0_a0
+LSYM(x111)	t0__9a0		! t0__4t0_a0	! b_e_t0	! t0__3t0
+LSYM(x112)	t0__3a0		! t0__2t0_a0	! b_e_t0	! t0__16t0
+LSYM(x113)	t0__9a0		! t0__4t0_a0	! b_e_t02a0	! t0__3t0
+LSYM(x114)	t0__9a0		! t0__2t0_a0	! b_e_2t0	! t0__3t0
+LSYM(x115)	t0__9a0		! t0__2t0_a0	! b_e_2t0a0	! t0__3t0
+LSYM(x116)	t0__3a0		! t0__2t0_a0	! b_e_4t0	! t0__4t0_a0
+LSYM(x117)	t0__3a0		! t0__4t0_a0	! b_e_t0	! t0__9t0
+LSYM(x118)	t0__3a0		! t0__4t0_a0	! b_e_t0a0	! t0__9t0
+LSYM(x119)	t0__3a0		! t0__4t0_a0	! b_e_t02a0	! t0__9t0
+LSYM(x120)	t0__5a0		! t0__3t0	! b_e_shift	! r__r_8t0
+LSYM(x121)	t0__5a0		! t0__3t0	! b_e_t0	! t0__8t0_a0
+LSYM(x122)	t0__5a0		! t0__3t0	! b_e_2t0	! t0__4t0_a0
+LSYM(x123)	t0__5a0		! t0__8t0_a0	! b_e_t0	! t0__3t0
+LSYM(x124)	t0__32a0	! t0__t0ma0	! b_e_shift	! r__r_4t0
+LSYM(x125)	t0__5a0		! t0__5t0	! b_e_t0	! t0__5t0
+LSYM(x126)	t0__64a0	! t0__t0ma0	! b_e_shift	! r__r_2t0
+LSYM(x127)	t0__128a0	! a1_ne_0_b_l0	! t0__t0ma0	! b_n_ret_t0
+LSYM(x128)	t0__128a0	! a1_ne_0_b_l1	! r__r_t0	! MILLIRETN
+LSYM(x129)	t0__128a0	! a1_ne_0_b_l0	! t0__t0_a0	! b_n_ret_t0
+LSYM(x130)	t0__64a0	! t0__t0_a0	! b_e_shift	! r__r_2t0
+LSYM(x131)	t0__8a0		! t0__8t0_a0	! b_e_t0	! t0__2t0_a0
+LSYM(x132)	t0__8a0		! t0__4t0_a0	! b_e_shift	! r__r_4t0
+LSYM(x133)	t0__8a0		! t0__4t0_a0	! b_e_t0	! t0__4t0_a0
+LSYM(x134)	t0__8a0		! t0__4t0_a0	! b_e_2t0	! t0__2t0_a0
+LSYM(x135)	t0__9a0		! t0__5t0	! b_e_t0	! t0__3t0
+LSYM(x136)	t0__8a0		! t0__2t0_a0	! b_e_shift	! r__r_8t0
+LSYM(x137)	t0__8a0		! t0__2t0_a0	! b_e_t0	! t0__8t0_a0
+LSYM(x138)	t0__8a0		! t0__2t0_a0	! b_e_2t0	! t0__4t0_a0
+LSYM(x139)	t0__8a0		! t0__2t0_a0	! b_e_2t0a0	! t0__4t0_a0
+LSYM(x140)	t0__3a0		! t0__2t0_a0	! b_e_4t0	! t0__5t0
+LSYM(x141)	t0__8a0		! t0__2t0_a0	! b_e_4t0a0	! t0__2t0_a0
+LSYM(x142)	t0__9a0		! t0__8t0	! b_e_2t0	! t0__t0ma0
+LSYM(x143)	t0__16a0	! t0__9t0	! b_e_t0	! t0__t0ma0
+LSYM(x144)	t0__9a0		! t0__8t0	! b_e_shift	! r__r_2t0
+LSYM(x145)	t0__9a0		! t0__8t0	! b_e_t0	! t0__2t0_a0
+LSYM(x146)	t0__9a0		! t0__8t0_a0	! b_e_shift	! r__r_2t0
+LSYM(x147)	t0__9a0		! t0__8t0_a0	! b_e_t0	! t0__2t0_a0
+LSYM(x148)	t0__9a0		! t0__4t0_a0	! b_e_shift	! r__r_4t0
+LSYM(x149)	t0__9a0		! t0__4t0_a0	! b_e_t0	! t0__4t0_a0
+LSYM(x150)	t0__9a0		! t0__4t0_a0	! b_e_2t0	! t0__2t0_a0
+LSYM(x151)	t0__9a0		! t0__4t0_a0	! b_e_2t0a0	! t0__2t0_a0
+LSYM(x152)	t0__9a0		! t0__2t0_a0	! b_e_shift	! r__r_8t0
+LSYM(x153)	t0__9a0		! t0__2t0_a0	! b_e_t0	! t0__8t0_a0
+LSYM(x154)	t0__9a0		! t0__2t0_a0	! b_e_2t0	! t0__4t0_a0
+LSYM(x155)	t0__32a0	! t0__t0ma0	! b_e_t0	! t0__5t0
+LSYM(x156)	t0__9a0		! t0__2t0_a0	! b_e_4t0	! t0__2t0_a0
+LSYM(x157)	t0__32a0	! t0__t0ma0	! b_e_t02a0	! t0__5t0
+LSYM(x158)	t0__16a0	! t0__5t0	! b_e_2t0	! t0__t0ma0
+LSYM(x159)	t0__32a0	! t0__5t0	! b_e_t0	! t0__t0ma0
+LSYM(x160)	t0__5a0		! t0__4t0	! b_e_shift	! r__r_8t0
+LSYM(x161)	t0__8a0		! t0__5t0	! b_e_t0	! t0__4t0_a0
+LSYM(x162)	t0__9a0		! t0__9t0	! b_e_shift	! r__r_2t0
+LSYM(x163)	t0__9a0		! t0__9t0	! b_e_t0	! t0__2t0_a0
+LSYM(x164)	t0__5a0		! t0__8t0_a0	! b_e_shift	! r__r_4t0
+LSYM(x165)	t0__8a0		! t0__4t0_a0	! b_e_t0	! t0__5t0
+LSYM(x166)	t0__5a0		! t0__8t0_a0	! b_e_2t0	! t0__2t0_a0
+LSYM(x167)	t0__5a0		! t0__8t0_a0	! b_e_2t0a0	! t0__2t0_a0
+LSYM(x168)	t0__5a0		! t0__4t0_a0	! b_e_shift	! r__r_8t0
+LSYM(x169)	t0__5a0		! t0__4t0_a0	! b_e_t0	! t0__8t0_a0
+LSYM(x170)	t0__32a0	! t0__t0_2a0	! b_e_t0	! t0__5t0
+LSYM(x171)	t0__9a0		! t0__2t0_a0	! b_e_t0	! t0__9t0
+LSYM(x172)	t0__5a0		! t0__4t0_a0	! b_e_4t0	! t0__2t0_a0
+LSYM(x173)	t0__9a0		! t0__2t0_a0	! b_e_t02a0	! t0__9t0
+LSYM(x174)	t0__32a0	! t0__t0_2a0	! b_e_t04a0	! t0__5t0
+LSYM(x175)	t0__8a0		! t0__2t0_a0	! b_e_5t0	! t0__2t0_a0
+LSYM(x176)	t0__5a0		! t0__4t0_a0	! b_e_8t0	! t0__t0_a0
+LSYM(x177)	t0__5a0		! t0__4t0_a0	! b_e_8t0a0	! t0__t0_a0
+LSYM(x178)	t0__5a0		! t0__2t0_a0	! b_e_2t0	! t0__8t0_a0
+LSYM(x179)	t0__5a0		! t0__2t0_a0	! b_e_2t0a0	! t0__8t0_a0
+LSYM(x180)	t0__9a0		! t0__5t0	! b_e_shift	! r__r_4t0
+LSYM(x181)	t0__9a0		! t0__5t0	! b_e_t0	! t0__4t0_a0
+LSYM(x182)	t0__9a0		! t0__5t0	! b_e_2t0	! t0__2t0_a0
+LSYM(x183)	t0__9a0		! t0__5t0	! b_e_2t0a0	! t0__2t0_a0
+LSYM(x184)	t0__5a0		! t0__9t0	! b_e_4t0	! t0__t0_a0
+LSYM(x185)	t0__9a0		! t0__4t0_a0	! b_e_t0	! t0__5t0
+LSYM(x186)	t0__32a0	! t0__t0ma0	! b_e_2t0	! t0__3t0
+LSYM(x187)	t0__9a0		! t0__4t0_a0	! b_e_t02a0	! t0__5t0
+LSYM(x188)	t0__9a0		! t0__5t0	! b_e_4t0	! t0__t0_2a0
+LSYM(x189)	t0__5a0		! t0__4t0_a0	! b_e_t0	! t0__9t0
+LSYM(x190)	t0__9a0		! t0__2t0_a0	! b_e_2t0	! t0__5t0
+LSYM(x191)	t0__64a0	! t0__3t0	! b_e_t0	! t0__t0ma0
+LSYM(x192)	t0__8a0		! t0__3t0	! b_e_shift	! r__r_8t0
+LSYM(x193)	t0__8a0		! t0__3t0	! b_e_t0	! t0__8t0_a0
+LSYM(x194)	t0__8a0		! t0__3t0	! b_e_2t0	! t0__4t0_a0
+LSYM(x195)	t0__8a0		! t0__8t0_a0	! b_e_t0	! t0__3t0
+LSYM(x196)	t0__8a0		! t0__3t0	! b_e_4t0	! t0__2t0_a0
+LSYM(x197)	t0__8a0		! t0__3t0	! b_e_4t0a0	! t0__2t0_a0
+LSYM(x198)	t0__64a0	! t0__t0_2a0	! b_e_t0	! t0__3t0
+LSYM(x199)	t0__8a0		! t0__4t0_a0	! b_e_2t0a0	! t0__3t0
+LSYM(x200)	t0__5a0		! t0__5t0	! b_e_shift	! r__r_8t0
+LSYM(x201)	t0__5a0		! t0__5t0	! b_e_t0	! t0__8t0_a0
+LSYM(x202)	t0__5a0		! t0__5t0	! b_e_2t0	! t0__4t0_a0
+LSYM(x203)	t0__5a0		! t0__5t0	! b_e_2t0a0	! t0__4t0_a0
+LSYM(x204)	t0__8a0		! t0__2t0_a0	! b_e_4t0	! t0__3t0
+LSYM(x205)	t0__5a0		! t0__8t0_a0	! b_e_t0	! t0__5t0
+LSYM(x206)	t0__64a0	! t0__t0_4a0	! b_e_t02a0	! t0__3t0
+LSYM(x207)	t0__8a0		! t0__2t0_a0	! b_e_3t0	! t0__4t0_a0
+LSYM(x208)	t0__5a0		! t0__5t0	! b_e_8t0	! t0__t0_a0
+LSYM(x209)	t0__5a0		! t0__5t0	! b_e_8t0a0	! t0__t0_a0
+LSYM(x210)	t0__5a0		! t0__4t0_a0	! b_e_2t0	! t0__5t0
+LSYM(x211)	t0__5a0		! t0__4t0_a0	! b_e_2t0a0	! t0__5t0
+LSYM(x212)	t0__3a0		! t0__4t0_a0	! b_e_4t0	! t0__4t0_a0
+LSYM(x213)	t0__3a0		! t0__4t0_a0	! b_e_4t0a0	! t0__4t0_a0
+LSYM(x214)	t0__9a0		! t0__t0_4a0	! b_e_2t04a0	! t0__8t0_a0
+LSYM(x215)	t0__5a0		! t0__4t0_a0	! b_e_5t0	! t0__2t0_a0
+LSYM(x216)	t0__9a0		! t0__3t0	! b_e_shift	! r__r_8t0
+LSYM(x217)	t0__9a0		! t0__3t0	! b_e_t0	! t0__8t0_a0
+LSYM(x218)	t0__9a0		! t0__3t0	! b_e_2t0	! t0__4t0_a0
+LSYM(x219)	t0__9a0		! t0__8t0_a0	! b_e_t0	! t0__3t0
+LSYM(x220)	t0__3a0		! t0__9t0	! b_e_4t0	! t0__2t0_a0
+LSYM(x221)	t0__3a0		! t0__9t0	! b_e_4t0a0	! t0__2t0_a0
+LSYM(x222)	t0__9a0		! t0__4t0_a0	! b_e_2t0	! t0__3t0
+LSYM(x223)	t0__9a0		! t0__4t0_a0	! b_e_2t0a0	! t0__3t0
+LSYM(x224)	t0__9a0		! t0__3t0	! b_e_8t0	! t0__t0_a0
+LSYM(x225)	t0__9a0		! t0__5t0	! b_e_t0	! t0__5t0
+LSYM(x226)	t0__3a0		! t0__2t0_a0	! b_e_t02a0	! t0__32t0
+LSYM(x227)	t0__9a0		! t0__5t0	! b_e_t02a0	! t0__5t0
+LSYM(x228)	t0__9a0		! t0__2t0_a0	! b_e_4t0	! t0__3t0
+LSYM(x229)	t0__9a0		! t0__2t0_a0	! b_e_4t0a0	! t0__3t0
+LSYM(x230)	t0__9a0		! t0__5t0	! b_e_5t0	! t0__t0_a0
+LSYM(x231)	t0__9a0		! t0__2t0_a0	! b_e_3t0	! t0__4t0_a0
+LSYM(x232)	t0__3a0		! t0__2t0_a0	! b_e_8t0	! t0__4t0_a0
+LSYM(x233)	t0__3a0		! t0__2t0_a0	! b_e_8t0a0	! t0__4t0_a0
+LSYM(x234)	t0__3a0		! t0__4t0_a0	! b_e_2t0	! t0__9t0
+LSYM(x235)	t0__3a0		! t0__4t0_a0	! b_e_2t0a0	! t0__9t0
+LSYM(x236)	t0__9a0		! t0__2t0_a0	! b_e_4t08a0	! t0__3t0
+LSYM(x237)	t0__16a0	! t0__5t0	! b_e_3t0	! t0__t0ma0
+LSYM(x238)	t0__3a0		! t0__4t0_a0	! b_e_2t04a0	! t0__9t0
+LSYM(x239)	t0__16a0	! t0__5t0	! b_e_t0ma0	! t0__3t0
+LSYM(x240)	t0__9a0		! t0__t0_a0	! b_e_8t0	! t0__3t0
+LSYM(x241)	t0__9a0		! t0__t0_a0	! b_e_8t0a0	! t0__3t0
+LSYM(x242)	t0__5a0		! t0__3t0	! b_e_2t0	! t0__8t0_a0
+LSYM(x243)	t0__9a0		! t0__9t0	! b_e_t0	! t0__3t0
+LSYM(x244)	t0__5a0		! t0__3t0	! b_e_4t0	! t0__4t0_a0
+LSYM(x245)	t0__8a0		! t0__3t0	! b_e_5t0	! t0__2t0_a0
+LSYM(x246)	t0__5a0		! t0__8t0_a0	! b_e_2t0	! t0__3t0
+LSYM(x247)	t0__5a0		! t0__8t0_a0	! b_e_2t0a0	! t0__3t0
+LSYM(x248)	t0__32a0	! t0__t0ma0	! b_e_shift	! r__r_8t0
+LSYM(x249)	t0__32a0	! t0__t0ma0	! b_e_t0	! t0__8t0_a0
+LSYM(x250)	t0__5a0		! t0__5t0	! b_e_2t0	! t0__5t0
+LSYM(x251)	t0__5a0		! t0__5t0	! b_e_2t0a0	! t0__5t0
+LSYM(x252)	t0__64a0	! t0__t0ma0	! b_e_shift	! r__r_4t0
+LSYM(x253)	t0__64a0	! t0__t0ma0	! b_e_t0	! t0__4t0_a0
+LSYM(x254)	t0__128a0	! t0__t0ma0	! b_e_shift	! r__r_2t0
+LSYM(x255)	t0__256a0	! a1_ne_0_b_l0	! t0__t0ma0	! b_n_ret_t0
+/*1040 insts before this.  */
+LSYM(ret_t0)	MILLIRET
+LSYM(e_t0)	r__r_t0
+LSYM(e_shift)	a1_ne_0_b_l2
+	a0__256a0	/* a0 <<= 8 *********** */
+	MILLIRETN
+LSYM(e_t0ma0)	a1_ne_0_b_l0
+	t0__t0ma0
+	MILLIRET
+	r__r_t0
+LSYM(e_t0a0)	a1_ne_0_b_l0
+	t0__t0_a0
+	MILLIRET
+	r__r_t0
+LSYM(e_t02a0)	a1_ne_0_b_l0
+	t0__t0_2a0
+	MILLIRET
+	r__r_t0
+LSYM(e_t04a0)	a1_ne_0_b_l0
+	t0__t0_4a0
+	MILLIRET
+	r__r_t0
+LSYM(e_2t0)	a1_ne_0_b_l1
+	r__r_2t0
+	MILLIRETN
+LSYM(e_2t0a0)	a1_ne_0_b_l0
+	t0__2t0_a0
+	MILLIRET
+	r__r_t0
+LSYM(e2t04a0)	t0__t0_2a0
+	a1_ne_0_b_l1
+	r__r_2t0
+	MILLIRETN
+LSYM(e_3t0)	a1_ne_0_b_l0
+	t0__3t0
+	MILLIRET
+	r__r_t0
+LSYM(e_4t0)	a1_ne_0_b_l1
+	r__r_4t0
+	MILLIRETN
+LSYM(e_4t0a0)	a1_ne_0_b_l0
+	t0__4t0_a0
+	MILLIRET
+	r__r_t0
+LSYM(e4t08a0)	t0__t0_2a0
+	a1_ne_0_b_l1
+	r__r_4t0
+	MILLIRETN
+LSYM(e_5t0)	a1_ne_0_b_l0
+	t0__5t0
+	MILLIRET
+	r__r_t0
+LSYM(e_8t0)	a1_ne_0_b_l1
+	r__r_8t0
+	MILLIRETN
+LSYM(e_8t0a0)	a1_ne_0_b_l0
+	t0__8t0_a0
+	MILLIRET
+	r__r_t0
+
+	.procend
+	.end
+#endif
diff --git a/libgcc/config/pa/t-linux b/libgcc/config/pa/t-linux
new file mode 100644
index 00000000000..d396bf7705a
--- /dev/null
+++ b/libgcc/config/pa/t-linux
@@ -0,0 +1,6 @@
+#Plug millicode routines into libgcc.a  We want these on both native and
+#cross compiles.  We use the "64-bit" routines because the "32-bit" code
+#is broken for certain corner cases.
+
+LIB1ASMSRC = pa/milli64.S
+LIB1ASMFUNCS = _divI _divU _remI _remU _div_const _mulI _dyncall
diff --git a/libgcc/config/pa/t-linux64 b/libgcc/config/pa/t-linux64
new file mode 100644
index 00000000000..6cb9806ff2e
--- /dev/null
+++ b/libgcc/config/pa/t-linux64
@@ -0,0 +1,4 @@
+# Plug millicode routines into libgcc.a  We want these on both native and
+# cross compiles.
+# FIXME: Explain.
+LIB1ASMFUNCS := $(filter-out _dyncall, $(LIB1ASMFUNCS))
diff --git a/libgcc/config/picochip/lib1funcs.S b/libgcc/config/picochip/lib1funcs.S
new file mode 100644
index 00000000000..d344170d248
--- /dev/null
+++ b/libgcc/config/picochip/lib1funcs.S
@@ -0,0 +1,4 @@
+// picoChip ASM file
+// Fake libgcc asm file. This contains nothing, but is used to prevent gcc
+// getting upset about the lack of a lib1funcs.S file when LIB1ASMFUNCS is
+// defined to switch off the compilation of parts of libgcc.
diff --git a/libgcc/config/picochip/t-picochip b/libgcc/config/picochip/t-picochip
index 5135d500cbb..a596ec98947 100644
--- a/libgcc/config/picochip/t-picochip
+++ b/libgcc/config/picochip/t-picochip
@@ -1,2 +1,9 @@
+# Prevent some of the more complicated libgcc functions from being
+# compiled.  This is because they are generally too big to fit into an
+# AE anyway, so there is no point in having them.  Also, some don't
+# compile properly so we'll ignore them for the moment.
+LIB1ASMSRC = picochip/lib1funcs.S
+LIB1ASMFUNCS = _mulsc3 _divsc3
+
 # Turn off the building of exception handling libraries.
 LIB2ADDEH =
diff --git a/libgcc/config/sh/lib1funcs.S b/libgcc/config/sh/lib1funcs.S
new file mode 100644
index 00000000000..2f0ca16cd91
--- /dev/null
+++ b/libgcc/config/sh/lib1funcs.S
@@ -0,0 +1,3933 @@
+/* Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
+   2004, 2005, 2006, 2009
+   Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+
+!! libgcc routines for the Renesas / SuperH SH CPUs.
+!! Contributed by Steve Chamberlain.
+!! sac@cygnus.com
+
+!! ashiftrt_r4_x, ___ashrsi3, ___ashlsi3, ___lshrsi3 routines
+!! recoded in assembly by Toshiyasu Morita
+!! tm@netcom.com
+
+#if defined(__ELF__) && defined(__linux__)
+.section .note.GNU-stack,"",%progbits
+.previous
+#endif
+
+/* SH2 optimizations for ___ashrsi3, ___ashlsi3, ___lshrsi3 and
+   ELF local label prefixes by J"orn Rennecke
+   amylaar@cygnus.com  */
+
+#include "lib1funcs.h"
+
+/* t-vxworks needs to build both PIC and non-PIC versions of libgcc,
+   so it is more convenient to define NO_FPSCR_VALUES here than to
+   define it on the command line.  */
+#if defined __vxworks && defined __PIC__
+#define NO_FPSCR_VALUES
+#endif
+	
+#if ! __SH5__
+#ifdef L_ashiftrt
+	.global	GLOBAL(ashiftrt_r4_0)
+	.global	GLOBAL(ashiftrt_r4_1)
+	.global	GLOBAL(ashiftrt_r4_2)
+	.global	GLOBAL(ashiftrt_r4_3)
+	.global	GLOBAL(ashiftrt_r4_4)
+	.global	GLOBAL(ashiftrt_r4_5)
+	.global	GLOBAL(ashiftrt_r4_6)
+	.global	GLOBAL(ashiftrt_r4_7)
+	.global	GLOBAL(ashiftrt_r4_8)
+	.global	GLOBAL(ashiftrt_r4_9)
+	.global	GLOBAL(ashiftrt_r4_10)
+	.global	GLOBAL(ashiftrt_r4_11)
+	.global	GLOBAL(ashiftrt_r4_12)
+	.global	GLOBAL(ashiftrt_r4_13)
+	.global	GLOBAL(ashiftrt_r4_14)
+	.global	GLOBAL(ashiftrt_r4_15)
+	.global	GLOBAL(ashiftrt_r4_16)
+	.global	GLOBAL(ashiftrt_r4_17)
+	.global	GLOBAL(ashiftrt_r4_18)
+	.global	GLOBAL(ashiftrt_r4_19)
+	.global	GLOBAL(ashiftrt_r4_20)
+	.global	GLOBAL(ashiftrt_r4_21)
+	.global	GLOBAL(ashiftrt_r4_22)
+	.global	GLOBAL(ashiftrt_r4_23)
+	.global	GLOBAL(ashiftrt_r4_24)
+	.global	GLOBAL(ashiftrt_r4_25)
+	.global	GLOBAL(ashiftrt_r4_26)
+	.global	GLOBAL(ashiftrt_r4_27)
+	.global	GLOBAL(ashiftrt_r4_28)
+	.global	GLOBAL(ashiftrt_r4_29)
+	.global	GLOBAL(ashiftrt_r4_30)
+	.global	GLOBAL(ashiftrt_r4_31)
+	.global	GLOBAL(ashiftrt_r4_32)
+
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_0))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_1))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_2))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_3))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_4))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_5))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_6))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_7))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_8))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_9))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_10))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_11))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_12))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_13))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_14))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_15))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_16))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_17))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_18))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_19))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_20))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_21))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_22))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_23))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_24))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_25))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_26))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_27))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_28))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_29))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_30))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_31))
+	HIDDEN_FUNC(GLOBAL(ashiftrt_r4_32))
+
+	.align	1
+GLOBAL(ashiftrt_r4_32):
+GLOBAL(ashiftrt_r4_31):
+	rotcl	r4
+	rts
+	subc	r4,r4
+
+GLOBAL(ashiftrt_r4_30):
+	shar	r4
+GLOBAL(ashiftrt_r4_29):
+	shar	r4
+GLOBAL(ashiftrt_r4_28):
+	shar	r4
+GLOBAL(ashiftrt_r4_27):
+	shar	r4
+GLOBAL(ashiftrt_r4_26):
+	shar	r4
+GLOBAL(ashiftrt_r4_25):
+	shar	r4
+GLOBAL(ashiftrt_r4_24):
+	shlr16	r4
+	shlr8	r4
+	rts
+	exts.b	r4,r4
+
+GLOBAL(ashiftrt_r4_23):
+	shar	r4
+GLOBAL(ashiftrt_r4_22):
+	shar	r4
+GLOBAL(ashiftrt_r4_21):
+	shar	r4
+GLOBAL(ashiftrt_r4_20):
+	shar	r4
+GLOBAL(ashiftrt_r4_19):
+	shar	r4
+GLOBAL(ashiftrt_r4_18):
+	shar	r4
+GLOBAL(ashiftrt_r4_17):
+	shar	r4
+GLOBAL(ashiftrt_r4_16):
+	shlr16	r4
+	rts
+	exts.w	r4,r4
+
+GLOBAL(ashiftrt_r4_15):
+	shar	r4
+GLOBAL(ashiftrt_r4_14):
+	shar	r4
+GLOBAL(ashiftrt_r4_13):
+	shar	r4
+GLOBAL(ashiftrt_r4_12):
+	shar	r4
+GLOBAL(ashiftrt_r4_11):
+	shar	r4
+GLOBAL(ashiftrt_r4_10):
+	shar	r4
+GLOBAL(ashiftrt_r4_9):
+	shar	r4
+GLOBAL(ashiftrt_r4_8):
+	shar	r4
+GLOBAL(ashiftrt_r4_7):
+	shar	r4
+GLOBAL(ashiftrt_r4_6):
+	shar	r4
+GLOBAL(ashiftrt_r4_5):
+	shar	r4
+GLOBAL(ashiftrt_r4_4):
+	shar	r4
+GLOBAL(ashiftrt_r4_3):
+	shar	r4
+GLOBAL(ashiftrt_r4_2):
+	shar	r4
+GLOBAL(ashiftrt_r4_1):
+	rts
+	shar	r4
+
+GLOBAL(ashiftrt_r4_0):
+	rts
+	nop
+
+	ENDFUNC(GLOBAL(ashiftrt_r4_0))
+	ENDFUNC(GLOBAL(ashiftrt_r4_1))
+	ENDFUNC(GLOBAL(ashiftrt_r4_2))
+	ENDFUNC(GLOBAL(ashiftrt_r4_3))
+	ENDFUNC(GLOBAL(ashiftrt_r4_4))
+	ENDFUNC(GLOBAL(ashiftrt_r4_5))
+	ENDFUNC(GLOBAL(ashiftrt_r4_6))
+	ENDFUNC(GLOBAL(ashiftrt_r4_7))
+	ENDFUNC(GLOBAL(ashiftrt_r4_8))
+	ENDFUNC(GLOBAL(ashiftrt_r4_9))
+	ENDFUNC(GLOBAL(ashiftrt_r4_10))
+	ENDFUNC(GLOBAL(ashiftrt_r4_11))
+	ENDFUNC(GLOBAL(ashiftrt_r4_12))
+	ENDFUNC(GLOBAL(ashiftrt_r4_13))
+	ENDFUNC(GLOBAL(ashiftrt_r4_14))
+	ENDFUNC(GLOBAL(ashiftrt_r4_15))
+	ENDFUNC(GLOBAL(ashiftrt_r4_16))
+	ENDFUNC(GLOBAL(ashiftrt_r4_17))
+	ENDFUNC(GLOBAL(ashiftrt_r4_18))
+	ENDFUNC(GLOBAL(ashiftrt_r4_19))
+	ENDFUNC(GLOBAL(ashiftrt_r4_20))
+	ENDFUNC(GLOBAL(ashiftrt_r4_21))
+	ENDFUNC(GLOBAL(ashiftrt_r4_22))
+	ENDFUNC(GLOBAL(ashiftrt_r4_23))
+	ENDFUNC(GLOBAL(ashiftrt_r4_24))
+	ENDFUNC(GLOBAL(ashiftrt_r4_25))
+	ENDFUNC(GLOBAL(ashiftrt_r4_26))
+	ENDFUNC(GLOBAL(ashiftrt_r4_27))
+	ENDFUNC(GLOBAL(ashiftrt_r4_28))
+	ENDFUNC(GLOBAL(ashiftrt_r4_29))
+	ENDFUNC(GLOBAL(ashiftrt_r4_30))
+	ENDFUNC(GLOBAL(ashiftrt_r4_31))
+	ENDFUNC(GLOBAL(ashiftrt_r4_32))
+#endif
+
+#ifdef L_ashiftrt_n
+
+!
+! GLOBAL(ashrsi3)
+!
+! Entry:
+!
+! r4: Value to shift
+! r5: Shifts
+!
+! Exit:
+!
+! r0: Result
+!
+! Destroys:
+!
+! (none)
+!
+
+	.global	GLOBAL(ashrsi3)
+	HIDDEN_FUNC(GLOBAL(ashrsi3))
+	.align	2
+GLOBAL(ashrsi3):
+	mov	#31,r0
+	and	r0,r5
+	mova	LOCAL(ashrsi3_table),r0
+	mov.b	@(r0,r5),r5
+#ifdef __sh1__
+	add	r5,r0
+	jmp	@r0
+#else
+	braf	r5
+#endif
+	mov	r4,r0
+
+	.align	2
+LOCAL(ashrsi3_table):
+	.byte		LOCAL(ashrsi3_0)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_1)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_2)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_3)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_4)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_5)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_6)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_7)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_8)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_9)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_10)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_11)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_12)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_13)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_14)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_15)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_16)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_17)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_18)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_19)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_20)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_21)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_22)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_23)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_24)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_25)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_26)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_27)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_28)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_29)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_30)-LOCAL(ashrsi3_table)
+	.byte		LOCAL(ashrsi3_31)-LOCAL(ashrsi3_table)
+
+LOCAL(ashrsi3_31):
+	rotcl	r0
+	rts
+	subc	r0,r0
+
+LOCAL(ashrsi3_30):
+	shar	r0
+LOCAL(ashrsi3_29):
+	shar	r0
+LOCAL(ashrsi3_28):
+	shar	r0
+LOCAL(ashrsi3_27):
+	shar	r0
+LOCAL(ashrsi3_26):
+	shar	r0
+LOCAL(ashrsi3_25):
+	shar	r0
+LOCAL(ashrsi3_24):
+	shlr16	r0
+	shlr8	r0
+	rts
+	exts.b	r0,r0
+
+LOCAL(ashrsi3_23):
+	shar	r0
+LOCAL(ashrsi3_22):
+	shar	r0
+LOCAL(ashrsi3_21):
+	shar	r0
+LOCAL(ashrsi3_20):
+	shar	r0
+LOCAL(ashrsi3_19):
+	shar	r0
+LOCAL(ashrsi3_18):
+	shar	r0
+LOCAL(ashrsi3_17):
+	shar	r0
+LOCAL(ashrsi3_16):
+	shlr16	r0
+	rts
+	exts.w	r0,r0
+
+LOCAL(ashrsi3_15):
+	shar	r0
+LOCAL(ashrsi3_14):
+	shar	r0
+LOCAL(ashrsi3_13):
+	shar	r0
+LOCAL(ashrsi3_12):
+	shar	r0
+LOCAL(ashrsi3_11):
+	shar	r0
+LOCAL(ashrsi3_10):
+	shar	r0
+LOCAL(ashrsi3_9):
+	shar	r0
+LOCAL(ashrsi3_8):
+	shar	r0
+LOCAL(ashrsi3_7):
+	shar	r0
+LOCAL(ashrsi3_6):
+	shar	r0
+LOCAL(ashrsi3_5):
+	shar	r0
+LOCAL(ashrsi3_4):
+	shar	r0
+LOCAL(ashrsi3_3):
+	shar	r0
+LOCAL(ashrsi3_2):
+	shar	r0
+LOCAL(ashrsi3_1):
+	rts
+	shar	r0
+
+LOCAL(ashrsi3_0):
+	rts
+	nop
+
+	ENDFUNC(GLOBAL(ashrsi3))
+#endif
+
+#ifdef L_ashiftlt
+
+!
+! GLOBAL(ashlsi3)
+!
+! Entry:
+!
+! r4: Value to shift
+! r5: Shifts
+!
+! Exit:
+!
+! r0: Result
+!
+! Destroys:
+!
+! (none)
+!
+	.global	GLOBAL(ashlsi3)
+	HIDDEN_FUNC(GLOBAL(ashlsi3))
+	.align	2
+GLOBAL(ashlsi3):
+	mov	#31,r0
+	and	r0,r5
+	mova	LOCAL(ashlsi3_table),r0
+	mov.b	@(r0,r5),r5
+#ifdef __sh1__
+	add	r5,r0
+	jmp	@r0
+#else
+	braf	r5
+#endif
+	mov	r4,r0
+
+	.align	2
+LOCAL(ashlsi3_table):
+	.byte		LOCAL(ashlsi3_0)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_1)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_2)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_3)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_4)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_5)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_6)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_7)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_8)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_9)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_10)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_11)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_12)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_13)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_14)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_15)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_16)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_17)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_18)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_19)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_20)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_21)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_22)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_23)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_24)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_25)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_26)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_27)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_28)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_29)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_30)-LOCAL(ashlsi3_table)
+	.byte		LOCAL(ashlsi3_31)-LOCAL(ashlsi3_table)
+
+LOCAL(ashlsi3_6):
+	shll2	r0
+LOCAL(ashlsi3_4):
+	shll2	r0
+LOCAL(ashlsi3_2):
+	rts
+	shll2	r0
+
+LOCAL(ashlsi3_7):
+	shll2	r0
+LOCAL(ashlsi3_5):
+	shll2	r0
+LOCAL(ashlsi3_3):
+	shll2	r0
+LOCAL(ashlsi3_1):
+	rts
+	shll	r0
+
+LOCAL(ashlsi3_14):
+	shll2	r0
+LOCAL(ashlsi3_12):
+	shll2	r0
+LOCAL(ashlsi3_10):
+	shll2	r0
+LOCAL(ashlsi3_8):
+	rts
+	shll8	r0
+
+LOCAL(ashlsi3_15):
+	shll2	r0
+LOCAL(ashlsi3_13):
+	shll2	r0
+LOCAL(ashlsi3_11):
+	shll2	r0
+LOCAL(ashlsi3_9):
+	shll8	r0
+	rts
+	shll	r0
+
+LOCAL(ashlsi3_22):
+	shll2	r0
+LOCAL(ashlsi3_20):
+	shll2	r0
+LOCAL(ashlsi3_18):
+	shll2	r0
+LOCAL(ashlsi3_16):
+	rts
+	shll16	r0
+
+LOCAL(ashlsi3_23):
+	shll2	r0
+LOCAL(ashlsi3_21):
+	shll2	r0
+LOCAL(ashlsi3_19):
+	shll2	r0
+LOCAL(ashlsi3_17):
+	shll16	r0
+	rts
+	shll	r0
+
+LOCAL(ashlsi3_30):
+	shll2	r0
+LOCAL(ashlsi3_28):
+	shll2	r0
+LOCAL(ashlsi3_26):
+	shll2	r0
+LOCAL(ashlsi3_24):
+	shll16	r0
+	rts
+	shll8	r0
+
+LOCAL(ashlsi3_31):
+	shll2	r0
+LOCAL(ashlsi3_29):
+	shll2	r0
+LOCAL(ashlsi3_27):
+	shll2	r0
+LOCAL(ashlsi3_25):
+	shll16	r0
+	shll8	r0
+	rts
+	shll	r0
+
+LOCAL(ashlsi3_0):
+	rts
+	nop
+
+	ENDFUNC(GLOBAL(ashlsi3))
+#endif
+
+#ifdef L_lshiftrt
+
+!
+! GLOBAL(lshrsi3)
+!
+! Entry:
+!
+! r4: Value to shift
+! r5: Shifts
+!
+! Exit:
+!
+! r0: Result
+!
+! Destroys:
+!
+! (none)
+!
+	.global	GLOBAL(lshrsi3)
+	HIDDEN_FUNC(GLOBAL(lshrsi3))
+	.align	2
+GLOBAL(lshrsi3):
+	mov	#31,r0
+	and	r0,r5
+	mova	LOCAL(lshrsi3_table),r0
+	mov.b	@(r0,r5),r5
+#ifdef __sh1__
+	add	r5,r0
+	jmp	@r0
+#else
+	braf	r5
+#endif
+	mov	r4,r0
+
+	.align	2
+LOCAL(lshrsi3_table):
+	.byte		LOCAL(lshrsi3_0)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_1)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_2)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_3)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_4)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_5)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_6)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_7)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_8)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_9)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_10)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_11)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_12)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_13)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_14)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_15)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_16)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_17)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_18)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_19)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_20)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_21)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_22)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_23)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_24)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_25)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_26)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_27)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_28)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_29)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_30)-LOCAL(lshrsi3_table)
+	.byte		LOCAL(lshrsi3_31)-LOCAL(lshrsi3_table)
+
+LOCAL(lshrsi3_6):
+	shlr2	r0
+LOCAL(lshrsi3_4):
+	shlr2	r0
+LOCAL(lshrsi3_2):
+	rts
+	shlr2	r0
+
+LOCAL(lshrsi3_7):
+	shlr2	r0
+LOCAL(lshrsi3_5):
+	shlr2	r0
+LOCAL(lshrsi3_3):
+	shlr2	r0
+LOCAL(lshrsi3_1):
+	rts
+	shlr	r0
+
+LOCAL(lshrsi3_14):
+	shlr2	r0
+LOCAL(lshrsi3_12):
+	shlr2	r0
+LOCAL(lshrsi3_10):
+	shlr2	r0
+LOCAL(lshrsi3_8):
+	rts
+	shlr8	r0
+
+LOCAL(lshrsi3_15):
+	shlr2	r0
+LOCAL(lshrsi3_13):
+	shlr2	r0
+LOCAL(lshrsi3_11):
+	shlr2	r0
+LOCAL(lshrsi3_9):
+	shlr8	r0
+	rts
+	shlr	r0
+
+LOCAL(lshrsi3_22):
+	shlr2	r0
+LOCAL(lshrsi3_20):
+	shlr2	r0
+LOCAL(lshrsi3_18):
+	shlr2	r0
+LOCAL(lshrsi3_16):
+	rts
+	shlr16	r0
+
+LOCAL(lshrsi3_23):
+	shlr2	r0
+LOCAL(lshrsi3_21):
+	shlr2	r0
+LOCAL(lshrsi3_19):
+	shlr2	r0
+LOCAL(lshrsi3_17):
+	shlr16	r0
+	rts
+	shlr	r0
+
+LOCAL(lshrsi3_30):
+	shlr2	r0
+LOCAL(lshrsi3_28):
+	shlr2	r0
+LOCAL(lshrsi3_26):
+	shlr2	r0
+LOCAL(lshrsi3_24):
+	shlr16	r0
+	rts
+	shlr8	r0
+
+LOCAL(lshrsi3_31):
+	shlr2	r0
+LOCAL(lshrsi3_29):
+	shlr2	r0
+LOCAL(lshrsi3_27):
+	shlr2	r0
+LOCAL(lshrsi3_25):
+	shlr16	r0
+	shlr8	r0
+	rts
+	shlr	r0
+
+LOCAL(lshrsi3_0):
+	rts
+	nop
+
+	ENDFUNC(GLOBAL(lshrsi3))
+#endif
+
+#ifdef L_movmem
+	.text
+	.balign	4
+	.global	GLOBAL(movmem)
+	HIDDEN_FUNC(GLOBAL(movmem))
+	HIDDEN_ALIAS(movstr,movmem)
+	/* This would be a lot simpler if r6 contained the byte count
+	   minus 64, and we wouldn't be called here for a byte count of 64.  */
+GLOBAL(movmem):
+	sts.l	pr,@-r15
+	shll2	r6
+	bsr	GLOBAL(movmemSI52+2)
+	mov.l	@(48,r5),r0
+	.balign	4
+LOCAL(movmem_loop): /* Reached with rts */
+	mov.l	@(60,r5),r0
+	add	#-64,r6
+	mov.l	r0,@(60,r4)
+	tst	r6,r6
+	mov.l	@(56,r5),r0
+	bt	LOCAL(movmem_done)
+	mov.l	r0,@(56,r4)
+	cmp/pl	r6
+	mov.l	@(52,r5),r0
+	add	#64,r5
+	mov.l	r0,@(52,r4)
+	add	#64,r4
+	bt	GLOBAL(movmemSI52)
+! done all the large groups, do the remainder
+! jump to movmem+
+	mova	GLOBAL(movmemSI4)+4,r0
+	add	r6,r0
+	jmp	@r0
+LOCAL(movmem_done): ! share slot insn, works out aligned.
+	lds.l	@r15+,pr
+	mov.l	r0,@(56,r4)
+	mov.l	@(52,r5),r0
+	rts
+	mov.l	r0,@(52,r4)
+	.balign	4
+! ??? We need aliases movstr* for movmem* for the older libraries.  These
+! aliases will be removed at the some point in the future.
+	.global	GLOBAL(movmemSI64)
+	HIDDEN_FUNC(GLOBAL(movmemSI64))
+	HIDDEN_ALIAS(movstrSI64,movmemSI64)
+GLOBAL(movmemSI64):
+	mov.l	@(60,r5),r0
+	mov.l	r0,@(60,r4)
+	.global	GLOBAL(movmemSI60)
+	HIDDEN_FUNC(GLOBAL(movmemSI60))
+	HIDDEN_ALIAS(movstrSI60,movmemSI60)
+GLOBAL(movmemSI60):
+	mov.l	@(56,r5),r0
+	mov.l	r0,@(56,r4)
+	.global	GLOBAL(movmemSI56)
+	HIDDEN_FUNC(GLOBAL(movmemSI56))
+	HIDDEN_ALIAS(movstrSI56,movmemSI56)
+GLOBAL(movmemSI56):
+	mov.l	@(52,r5),r0
+	mov.l	r0,@(52,r4)
+	.global	GLOBAL(movmemSI52)
+	HIDDEN_FUNC(GLOBAL(movmemSI52))
+	HIDDEN_ALIAS(movstrSI52,movmemSI52)
+GLOBAL(movmemSI52):
+	mov.l	@(48,r5),r0
+	mov.l	r0,@(48,r4)
+	.global	GLOBAL(movmemSI48)
+	HIDDEN_FUNC(GLOBAL(movmemSI48))
+	HIDDEN_ALIAS(movstrSI48,movmemSI48)
+GLOBAL(movmemSI48):
+	mov.l	@(44,r5),r0
+	mov.l	r0,@(44,r4)
+	.global	GLOBAL(movmemSI44)
+	HIDDEN_FUNC(GLOBAL(movmemSI44))
+	HIDDEN_ALIAS(movstrSI44,movmemSI44)
+GLOBAL(movmemSI44):
+	mov.l	@(40,r5),r0
+	mov.l	r0,@(40,r4)
+	.global	GLOBAL(movmemSI40)
+	HIDDEN_FUNC(GLOBAL(movmemSI40))
+	HIDDEN_ALIAS(movstrSI40,movmemSI40)
+GLOBAL(movmemSI40):
+	mov.l	@(36,r5),r0
+	mov.l	r0,@(36,r4)
+	.global	GLOBAL(movmemSI36)
+	HIDDEN_FUNC(GLOBAL(movmemSI36))
+	HIDDEN_ALIAS(movstrSI36,movmemSI36)
+GLOBAL(movmemSI36):
+	mov.l	@(32,r5),r0
+	mov.l	r0,@(32,r4)
+	.global	GLOBAL(movmemSI32)
+	HIDDEN_FUNC(GLOBAL(movmemSI32))
+	HIDDEN_ALIAS(movstrSI32,movmemSI32)
+GLOBAL(movmemSI32):
+	mov.l	@(28,r5),r0
+	mov.l	r0,@(28,r4)
+	.global	GLOBAL(movmemSI28)
+	HIDDEN_FUNC(GLOBAL(movmemSI28))
+	HIDDEN_ALIAS(movstrSI28,movmemSI28)
+GLOBAL(movmemSI28):
+	mov.l	@(24,r5),r0
+	mov.l	r0,@(24,r4)
+	.global	GLOBAL(movmemSI24)
+	HIDDEN_FUNC(GLOBAL(movmemSI24))
+	HIDDEN_ALIAS(movstrSI24,movmemSI24)
+GLOBAL(movmemSI24):
+	mov.l	@(20,r5),r0
+	mov.l	r0,@(20,r4)
+	.global	GLOBAL(movmemSI20)
+	HIDDEN_FUNC(GLOBAL(movmemSI20))
+	HIDDEN_ALIAS(movstrSI20,movmemSI20)
+GLOBAL(movmemSI20):
+	mov.l	@(16,r5),r0
+	mov.l	r0,@(16,r4)
+	.global	GLOBAL(movmemSI16)
+	HIDDEN_FUNC(GLOBAL(movmemSI16))
+	HIDDEN_ALIAS(movstrSI16,movmemSI16)
+GLOBAL(movmemSI16):
+	mov.l	@(12,r5),r0
+	mov.l	r0,@(12,r4)
+	.global	GLOBAL(movmemSI12)
+	HIDDEN_FUNC(GLOBAL(movmemSI12))
+	HIDDEN_ALIAS(movstrSI12,movmemSI12)
+GLOBAL(movmemSI12):
+	mov.l	@(8,r5),r0
+	mov.l	r0,@(8,r4)
+	.global	GLOBAL(movmemSI8)
+	HIDDEN_FUNC(GLOBAL(movmemSI8))
+	HIDDEN_ALIAS(movstrSI8,movmemSI8)
+GLOBAL(movmemSI8):
+	mov.l	@(4,r5),r0
+	mov.l	r0,@(4,r4)
+	.global	GLOBAL(movmemSI4)
+	HIDDEN_FUNC(GLOBAL(movmemSI4))
+	HIDDEN_ALIAS(movstrSI4,movmemSI4)
+GLOBAL(movmemSI4):
+	mov.l	@(0,r5),r0
+	rts
+	mov.l	r0,@(0,r4)
+
+	ENDFUNC(GLOBAL(movmemSI64))
+	ENDFUNC(GLOBAL(movmemSI60))
+	ENDFUNC(GLOBAL(movmemSI56))
+	ENDFUNC(GLOBAL(movmemSI52))
+	ENDFUNC(GLOBAL(movmemSI48))
+	ENDFUNC(GLOBAL(movmemSI44))
+	ENDFUNC(GLOBAL(movmemSI40))
+	ENDFUNC(GLOBAL(movmemSI36))
+	ENDFUNC(GLOBAL(movmemSI32))
+	ENDFUNC(GLOBAL(movmemSI28))
+	ENDFUNC(GLOBAL(movmemSI24))
+	ENDFUNC(GLOBAL(movmemSI20))
+	ENDFUNC(GLOBAL(movmemSI16))
+	ENDFUNC(GLOBAL(movmemSI12))
+	ENDFUNC(GLOBAL(movmemSI8))
+	ENDFUNC(GLOBAL(movmemSI4))
+	ENDFUNC(GLOBAL(movmem))
+#endif
+
+#ifdef L_movmem_i4
+	.text
+	.global	GLOBAL(movmem_i4_even)
+	.global	GLOBAL(movmem_i4_odd)
+	.global	GLOBAL(movmemSI12_i4)
+
+	HIDDEN_FUNC(GLOBAL(movmem_i4_even))
+	HIDDEN_FUNC(GLOBAL(movmem_i4_odd))
+	HIDDEN_FUNC(GLOBAL(movmemSI12_i4))
+
+	HIDDEN_ALIAS(movstr_i4_even,movmem_i4_even)
+	HIDDEN_ALIAS(movstr_i4_odd,movmem_i4_odd)
+	HIDDEN_ALIAS(movstrSI12_i4,movmemSI12_i4)
+
+	.p2align	5
+L_movmem_2mod4_end:
+	mov.l	r0,@(16,r4)
+	rts
+	mov.l	r1,@(20,r4)
+
+	.p2align	2
+
+GLOBAL(movmem_i4_even):
+	mov.l	@r5+,r0
+	bra	L_movmem_start_even
+	mov.l	@r5+,r1
+
+GLOBAL(movmem_i4_odd):
+	mov.l	@r5+,r1
+	add	#-4,r4
+	mov.l	@r5+,r2
+	mov.l	@r5+,r3
+	mov.l	r1,@(4,r4)
+	mov.l	r2,@(8,r4)
+
+L_movmem_loop:
+	mov.l	r3,@(12,r4)
+	dt	r6
+	mov.l	@r5+,r0
+	bt/s	L_movmem_2mod4_end
+	mov.l	@r5+,r1
+	add	#16,r4
+L_movmem_start_even:
+	mov.l	@r5+,r2
+	mov.l	@r5+,r3
+	mov.l	r0,@r4
+	dt	r6
+	mov.l	r1,@(4,r4)
+	bf/s	L_movmem_loop
+	mov.l	r2,@(8,r4)
+	rts
+	mov.l	r3,@(12,r4)
+
+	ENDFUNC(GLOBAL(movmem_i4_even))
+	ENDFUNC(GLOBAL(movmem_i4_odd))
+
+	.p2align	4
+GLOBAL(movmemSI12_i4):
+	mov.l	@r5,r0
+	mov.l	@(4,r5),r1
+	mov.l	@(8,r5),r2
+	mov.l	r0,@r4
+	mov.l	r1,@(4,r4)
+	rts
+	mov.l	r2,@(8,r4)
+
+	ENDFUNC(GLOBAL(movmemSI12_i4))
+#endif
+
+#ifdef L_mulsi3
+
+
+	.global	GLOBAL(mulsi3)
+	HIDDEN_FUNC(GLOBAL(mulsi3))
+
+! r4 =       aabb
+! r5 =       ccdd
+! r0 = aabb*ccdd  via partial products
+!
+! if aa == 0 and cc = 0
+! r0 = bb*dd
+!
+! else
+! aa = bb*dd + (aa*dd*65536) + (cc*bb*65536)
+!
+
+GLOBAL(mulsi3):
+	mulu.w  r4,r5		! multiply the lsws  macl=bb*dd
+	mov     r5,r3		! r3 = ccdd
+	swap.w  r4,r2		! r2 = bbaa
+	xtrct   r2,r3		! r3 = aacc
+	tst  	r3,r3		! msws zero ?
+	bf      hiset
+	rts			! yes - then we have the answer
+	sts     macl,r0
+
+hiset:	sts	macl,r0		! r0 = bb*dd
+	mulu.w	r2,r5		! brewing macl = aa*dd
+	sts	macl,r1
+	mulu.w	r3,r4		! brewing macl = cc*bb
+	sts	macl,r2
+	add	r1,r2
+	shll16	r2
+	rts
+	add	r2,r0
+
+	ENDFUNC(GLOBAL(mulsi3))
+#endif
+#endif /* ! __SH5__ */
+#ifdef L_sdivsi3_i4
+	.title "SH DIVIDE"
+!! 4 byte integer Divide code for the Renesas SH
+#ifdef __SH4__
+!! args in r4 and r5, result in fpul, clobber dr0, dr2
+
+	.global	GLOBAL(sdivsi3_i4)
+	HIDDEN_FUNC(GLOBAL(sdivsi3_i4))
+GLOBAL(sdivsi3_i4):
+	lds r4,fpul
+	float fpul,dr0
+	lds r5,fpul
+	float fpul,dr2
+	fdiv dr2,dr0
+	rts
+	ftrc dr0,fpul
+
+	ENDFUNC(GLOBAL(sdivsi3_i4))
+#elif defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) || (defined (__SH5__) && ! defined __SH4_NOFPU__)
+!! args in r4 and r5, result in fpul, clobber r2, dr0, dr2
+
+#if ! __SH5__ || __SH5__ == 32
+#if __SH5__
+	.mode	SHcompact
+#endif
+	.global	GLOBAL(sdivsi3_i4)
+	HIDDEN_FUNC(GLOBAL(sdivsi3_i4))
+GLOBAL(sdivsi3_i4):
+	sts.l fpscr,@-r15
+	mov #8,r2
+	swap.w r2,r2
+	lds r2,fpscr
+	lds r4,fpul
+	float fpul,dr0
+	lds r5,fpul
+	float fpul,dr2
+	fdiv dr2,dr0
+	ftrc dr0,fpul
+	rts
+	lds.l @r15+,fpscr
+
+	ENDFUNC(GLOBAL(sdivsi3_i4))
+#endif /* ! __SH5__ || __SH5__ == 32 */
+#endif /* ! __SH4__ */
+#endif
+
+#ifdef L_sdivsi3
+/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with
+   sh2e/sh3e code.  */
+#if (! defined(__SH4__) && ! defined (__SH4_SINGLE__)) || defined (__linux__)
+!!
+!! Steve Chamberlain
+!! sac@cygnus.com
+!!
+!!
+
+!! args in r4 and r5, result in r0 clobber r1, r2, r3, and t bit
+
+	.global	GLOBAL(sdivsi3)
+#if __SHMEDIA__
+#if __SH5__ == 32
+	.section	.text..SHmedia32,"ax"
+#else
+	.text
+#endif
+	.align	2
+#if 0
+/* The assembly code that follows is a hand-optimized version of the C
+   code that follows.  Note that the registers that are modified are
+   exactly those listed as clobbered in the patterns divsi3_i1 and
+   divsi3_i1_media.
+	
+int __sdivsi3 (i, j)
+     int i, j;
+{
+  register unsigned long long r18 asm ("r18");
+  register unsigned long long r19 asm ("r19");
+  register unsigned long long r0 asm ("r0") = 0;
+  register unsigned long long r1 asm ("r1") = 1;
+  register int r2 asm ("r2") = i >> 31;
+  register int r3 asm ("r3") = j >> 31;
+
+  r2 = r2 ? r2 : r1;
+  r3 = r3 ? r3 : r1;
+  r18 = i * r2;
+  r19 = j * r3;
+  r2 *= r3;
+  
+  r19 <<= 31;
+  r1 <<= 31;
+  do
+    if (r18 >= r19)
+      r0 |= r1, r18 -= r19;
+  while (r19 >>= 1, r1 >>= 1);
+
+  return r2 * (int)r0;
+}
+*/
+GLOBAL(sdivsi3):
+	pt/l	LOCAL(sdivsi3_dontadd), tr2
+	pt/l	LOCAL(sdivsi3_loop), tr1
+	ptabs/l	r18, tr0
+	movi	0, r0
+	movi	1, r1
+	shari.l	r4, 31, r2
+	shari.l	r5, 31, r3
+	cmveq	r2, r1, r2
+	cmveq	r3, r1, r3
+	muls.l	r4, r2, r18
+	muls.l	r5, r3, r19
+	muls.l	r2, r3, r2
+	shlli	r19, 31, r19
+	shlli	r1, 31, r1
+LOCAL(sdivsi3_loop):
+	bgtu	r19, r18, tr2
+	or	r0, r1, r0
+	sub	r18, r19, r18
+LOCAL(sdivsi3_dontadd):
+	shlri	r1, 1, r1
+	shlri	r19, 1, r19
+	bnei	r1, 0, tr1
+	muls.l	r0, r2, r0
+	add.l	r0, r63, r0
+	blink	tr0, r63
+#elif 0 /* ! 0 */
+ // inputs: r4,r5
+ // clobbered: r1,r2,r3,r18,r19,r20,r21,r25,tr0
+ // result in r0
+GLOBAL(sdivsi3):
+ // can create absolute value without extra latency,
+ // but dependent on proper sign extension of inputs:
+ // shari.l r5,31,r2
+ // xor r5,r2,r20
+ // sub r20,r2,r20 // r20 is now absolute value of r5, zero-extended.
+ shari.l r5,31,r2
+ ori r2,1,r2
+ muls.l r5,r2,r20 // r20 is now absolute value of r5, zero-extended.
+ movi 0xffffffffffffbb0c,r19 // shift count eqiv 76
+ shari.l r4,31,r3
+ nsb r20,r0
+ shlld r20,r0,r25
+ shlri r25,48,r25
+ sub r19,r25,r1
+ mmulfx.w r1,r1,r2
+ mshflo.w r1,r63,r1
+ // If r4 was to be used in-place instead of r21, could use this sequence
+ // to compute absolute:
+ // sub r63,r4,r19 // compute absolute value of r4
+ // shlri r4,32,r3 // into lower 32 bit of r4, keeping
+ // mcmv r19,r3,r4 // the sign in the upper 32 bits intact.
+ ori r3,1,r3
+ mmulfx.w r25,r2,r2
+ sub r19,r0,r0
+ muls.l r4,r3,r21
+ msub.w r1,r2,r2
+ addi r2,-2,r1
+ mulu.l r21,r1,r19
+ mmulfx.w r2,r2,r2
+ shlli r1,15,r1
+ shlrd r19,r0,r19
+ mulu.l r19,r20,r3
+ mmacnfx.wl r25,r2,r1
+ ptabs r18,tr0
+ sub r21,r3,r25
+
+ mulu.l r25,r1,r2
+ addi r0,14,r0
+ xor r4,r5,r18
+ shlrd r2,r0,r2
+ mulu.l r2,r20,r3
+ add r19,r2,r19
+ shari.l r18,31,r18
+ sub r25,r3,r25
+
+ mulu.l r25,r1,r2
+ sub r25,r20,r25
+ add r19,r18,r19
+ shlrd r2,r0,r2
+ mulu.l r2,r20,r3
+ addi r25,1,r25
+ add r19,r2,r19
+
+ cmpgt r25,r3,r25
+ add.l r19,r25,r0
+ xor r0,r18,r0
+ blink tr0,r63
+#else /* ! 0 && ! 0 */
+
+ // inputs: r4,r5
+ // clobbered: r1,r18,r19,r20,r21,r25,tr0
+ // result in r0
+	HIDDEN_FUNC(GLOBAL(sdivsi3_2))
+#ifndef __pic__
+	FUNC(GLOBAL(sdivsi3))
+GLOBAL(sdivsi3): /* this is the shcompact entry point */
+ // The special SHmedia entry point sdivsi3_1 prevents accidental linking
+ // with the SHcompact implementation, which clobbers tr1 / tr2.
+ .global GLOBAL(sdivsi3_1)
+GLOBAL(sdivsi3_1):
+ .global GLOBAL(div_table_internal)
+ movi (GLOBAL(div_table_internal) >> 16) & 65535, r20
+ shori GLOBAL(div_table_internal) & 65535, r20
+#endif
+ .global GLOBAL(sdivsi3_2)
+ // div_table in r20
+ // clobbered: r1,r18,r19,r21,r25,tr0
+GLOBAL(sdivsi3_2):
+ nsb r5, r1
+ shlld r5, r1, r25    // normalize; [-2 ..1, 1..2) in s2.62
+ shari r25, 58, r21   // extract 5(6) bit index (s2.4 with hole -1..1)
+ ldx.ub r20, r21, r19 // u0.8
+ shari r25, 32, r25   // normalize to s2.30
+ shlli r21, 1, r21
+ muls.l r25, r19, r19 // s2.38
+ ldx.w r20, r21, r21  // s2.14
+  ptabs r18, tr0
+ shari r19, 24, r19   // truncate to s2.14
+ sub r21, r19, r19    // some 11 bit inverse in s1.14
+ muls.l r19, r19, r21 // u0.28
+  sub r63, r1, r1
+  addi r1, 92, r1
+ muls.l r25, r21, r18 // s2.58
+ shlli r19, 45, r19   // multiply by two and convert to s2.58
+  /* bubble */
+ sub r19, r18, r18
+ shari r18, 28, r18   // some 22 bit inverse in s1.30
+ muls.l r18, r25, r0  // s2.60
+  muls.l r18, r4, r25 // s32.30
+  /* bubble */
+ shari r0, 16, r19   // s-16.44
+ muls.l r19, r18, r19 // s-16.74
+  shari r25, 63, r0
+  shari r4, 14, r18   // s19.-14
+ shari r19, 30, r19   // s-16.44
+ muls.l r19, r18, r19 // s15.30
+  xor r21, r0, r21    // You could also use the constant 1 << 27.
+  add r21, r25, r21
+ sub r21, r19, r21
+ shard r21, r1, r21
+ sub r21, r0, r0
+ blink tr0, r63
+#ifndef __pic__
+	ENDFUNC(GLOBAL(sdivsi3))
+#endif
+	ENDFUNC(GLOBAL(sdivsi3_2))
+#endif
+#elif defined __SHMEDIA__
+/* m5compact-nofpu */
+ // clobbered: r18,r19,r20,r21,r25,tr0,tr1,tr2
+	.mode	SHmedia
+	.section	.text..SHmedia32,"ax"
+	.align	2
+	FUNC(GLOBAL(sdivsi3))
+GLOBAL(sdivsi3):
+	pt/l LOCAL(sdivsi3_dontsub), tr0
+	pt/l LOCAL(sdivsi3_loop), tr1
+	ptabs/l r18,tr2
+	shari.l r4,31,r18
+	shari.l r5,31,r19
+	xor r4,r18,r20
+	xor r5,r19,r21
+	sub.l r20,r18,r20
+	sub.l r21,r19,r21
+	xor r18,r19,r19
+	shlli r21,32,r25
+	addi r25,-1,r21
+	addz.l r20,r63,r20
+LOCAL(sdivsi3_loop):
+	shlli r20,1,r20
+	bgeu/u r21,r20,tr0
+	sub r20,r21,r20
+LOCAL(sdivsi3_dontsub):
+	addi.l r25,-1,r25
+	bnei r25,-32,tr1
+	xor r20,r19,r20
+	sub.l r20,r19,r0
+	blink tr2,r63
+	ENDFUNC(GLOBAL(sdivsi3))
+#else /* ! __SHMEDIA__ */
+	FUNC(GLOBAL(sdivsi3))
+GLOBAL(sdivsi3):
+	mov	r4,r1
+	mov	r5,r0
+
+	tst	r0,r0
+	bt	div0
+	mov	#0,r2
+	div0s	r2,r1
+	subc	r3,r3
+	subc	r2,r1
+	div0s	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	div1	r0,r3
+	rotcl	r1
+	addc	r2,r1
+	rts
+	mov	r1,r0
+
+
+div0:	rts
+	mov	#0,r0
+
+	ENDFUNC(GLOBAL(sdivsi3))
+#endif /* ! __SHMEDIA__ */
+#endif /* ! __SH4__ */
+#endif
+#ifdef L_udivsi3_i4
+
+	.title "SH DIVIDE"
+!! 4 byte integer Divide code for the Renesas SH
+#ifdef __SH4__
+!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4,
+!! and t bit
+
+	.global	GLOBAL(udivsi3_i4)
+	HIDDEN_FUNC(GLOBAL(udivsi3_i4))
+GLOBAL(udivsi3_i4):
+	mov #1,r1
+	cmp/hi r1,r5
+	bf trivial
+	rotr r1
+	xor r1,r4
+	lds r4,fpul
+	mova L1,r0
+#ifdef FMOVD_WORKS
+	fmov.d @r0+,dr4
+#else
+	fmov.s @r0+,DR40
+	fmov.s @r0,DR41
+#endif
+	float fpul,dr0
+	xor r1,r5
+	lds r5,fpul
+	float fpul,dr2
+	fadd dr4,dr0
+	fadd dr4,dr2
+	fdiv dr2,dr0
+	rts
+	ftrc dr0,fpul
+
+trivial:
+	rts
+	lds r4,fpul
+
+	.align 2
+#ifdef FMOVD_WORKS
+	.align 3	! make double below 8 byte aligned.
+#endif
+L1:
+	.double 2147483648
+
+	ENDFUNC(GLOBAL(udivsi3_i4))
+#elif defined (__SH5__) && ! defined (__SH4_NOFPU__)
+#if ! __SH5__ || __SH5__ == 32
+!! args in r4 and r5, result in fpul, clobber r20, r21, dr0, fr33
+	.mode	SHmedia
+	.global	GLOBAL(udivsi3_i4)
+	HIDDEN_FUNC(GLOBAL(udivsi3_i4))
+GLOBAL(udivsi3_i4):
+	addz.l	r4,r63,r20
+	addz.l	r5,r63,r21
+	fmov.qd	r20,dr0
+	fmov.qd	r21,dr32
+	ptabs	r18,tr0
+	float.qd dr0,dr0
+	float.qd dr32,dr32
+	fdiv.d	dr0,dr32,dr0
+	ftrc.dq dr0,dr32
+	fmov.s fr33,fr32
+	blink tr0,r63
+
+	ENDFUNC(GLOBAL(udivsi3_i4))
+#endif /* ! __SH5__ || __SH5__ == 32 */
+#elif defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__)
+!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4
+
+	.global	GLOBAL(udivsi3_i4)
+	HIDDEN_FUNC(GLOBAL(udivsi3_i4))
+GLOBAL(udivsi3_i4):
+	mov #1,r1
+	cmp/hi r1,r5
+	bf trivial
+	sts.l fpscr,@-r15
+	mova L1,r0
+	lds.l @r0+,fpscr
+	rotr r1
+	xor r1,r4
+	lds r4,fpul
+#ifdef FMOVD_WORKS
+	fmov.d @r0+,dr4
+#else
+	fmov.s @r0+,DR40
+	fmov.s @r0,DR41
+#endif
+	float fpul,dr0
+	xor r1,r5
+	lds r5,fpul
+	float fpul,dr2
+	fadd dr4,dr0
+	fadd dr4,dr2
+	fdiv dr2,dr0
+	ftrc dr0,fpul
+	rts
+	lds.l @r15+,fpscr
+
+#ifdef FMOVD_WORKS
+	.align 3	! make double below 8 byte aligned.
+#endif
+trivial:
+	rts
+	lds r4,fpul
+
+	.align 2
+L1:
+#ifndef FMOVD_WORKS
+	.long 0x80000
+#else
+	.long 0x180000
+#endif
+	.double 2147483648
+
+	ENDFUNC(GLOBAL(udivsi3_i4))
+#endif /* ! __SH4__ */
+#endif
+
+#ifdef L_udivsi3
+/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with
+   sh2e/sh3e code.  */
+#if (! defined(__SH4__) && ! defined (__SH4_SINGLE__)) || defined (__linux__)
+
+!! args in r4 and r5, result in r0, clobbers r4, pr, and t bit
+	.global	GLOBAL(udivsi3)
+	HIDDEN_FUNC(GLOBAL(udivsi3))
+
+#if __SHMEDIA__
+#if __SH5__ == 32
+	.section	.text..SHmedia32,"ax"
+#else
+	.text
+#endif
+	.align	2
+#if 0
+/* The assembly code that follows is a hand-optimized version of the C
+   code that follows.  Note that the registers that are modified are
+   exactly those listed as clobbered in the patterns udivsi3_i1 and
+   udivsi3_i1_media.
+	
+unsigned 
+__udivsi3 (i, j)
+    unsigned i, j; 
+{
+  register unsigned long long r0 asm ("r0") = 0;
+  register unsigned long long r18 asm ("r18") = 1;
+  register unsigned long long r4 asm ("r4") = i;
+  register unsigned long long r19 asm ("r19") = j;
+
+  r19 <<= 31;
+  r18 <<= 31;
+  do
+    if (r4 >= r19)
+      r0 |= r18, r4 -= r19;
+  while (r19 >>= 1, r18 >>= 1);
+
+  return r0;
+}
+*/
+GLOBAL(udivsi3):
+	pt/l	LOCAL(udivsi3_dontadd), tr2
+	pt/l	LOCAL(udivsi3_loop), tr1
+	ptabs/l	r18, tr0
+	movi	0, r0
+	movi	1, r18
+	addz.l	r5, r63, r19
+	addz.l	r4, r63, r4
+	shlli	r19, 31, r19
+	shlli	r18, 31, r18
+LOCAL(udivsi3_loop):
+	bgtu	r19, r4, tr2
+	or	r0, r18, r0
+	sub	r4, r19, r4
+LOCAL(udivsi3_dontadd):
+	shlri	r18, 1, r18
+	shlri	r19, 1, r19
+	bnei	r18, 0, tr1
+	blink	tr0, r63
+#else
+GLOBAL(udivsi3):
+ // inputs: r4,r5
+ // clobbered: r18,r19,r20,r21,r22,r25,tr0
+ // result in r0.
+ addz.l r5,r63,r22
+ nsb r22,r0
+ shlld r22,r0,r25
+ shlri r25,48,r25
+ movi 0xffffffffffffbb0c,r20 // shift count eqiv 76
+ sub r20,r25,r21
+ mmulfx.w r21,r21,r19
+ mshflo.w r21,r63,r21
+ ptabs r18,tr0
+ mmulfx.w r25,r19,r19
+ sub r20,r0,r0
+ /* bubble */
+ msub.w r21,r19,r19
+ addi r19,-2,r21 /* It would be nice for scheduling to do this add to r21
+		    before the msub.w, but we need a different value for
+		    r19 to keep errors under control.  */
+ mulu.l r4,r21,r18
+ mmulfx.w r19,r19,r19
+ shlli r21,15,r21
+ shlrd r18,r0,r18
+ mulu.l r18,r22,r20
+ mmacnfx.wl r25,r19,r21
+ /* bubble */
+ sub r4,r20,r25
+
+ mulu.l r25,r21,r19
+ addi r0,14,r0
+ /* bubble */
+ shlrd r19,r0,r19
+ mulu.l r19,r22,r20
+ add r18,r19,r18
+ /* bubble */
+ sub.l r25,r20,r25
+
+ mulu.l r25,r21,r19
+ addz.l r25,r63,r25
+ sub r25,r22,r25
+ shlrd r19,r0,r19
+ mulu.l r19,r22,r20
+ addi r25,1,r25
+ add r18,r19,r18
+
+ cmpgt r25,r20,r25
+ add.l r18,r25,r0
+ blink tr0,r63
+#endif
+#elif defined (__SHMEDIA__)
+/* m5compact-nofpu - more emphasis on code size than on speed, but don't
+   ignore speed altogether - div1 needs 9 cycles, subc 7 and rotcl 4.
+   So use a short shmedia loop.  */
+ // clobbered: r20,r21,r25,tr0,tr1,tr2
+	.mode	SHmedia
+	.section	.text..SHmedia32,"ax"
+	.align	2
+GLOBAL(udivsi3):
+ pt/l LOCAL(udivsi3_dontsub), tr0
+ pt/l LOCAL(udivsi3_loop), tr1
+ ptabs/l r18,tr2
+ shlli r5,32,r25
+ addi r25,-1,r21
+ addz.l r4,r63,r20
+LOCAL(udivsi3_loop):
+ shlli r20,1,r20
+ bgeu/u r21,r20,tr0
+ sub r20,r21,r20
+LOCAL(udivsi3_dontsub):
+ addi.l r25,-1,r25
+ bnei r25,-32,tr1
+ add.l r20,r63,r0
+ blink tr2,r63
+#else /* ! defined (__SHMEDIA__) */
+LOCAL(div8):
+ div1 r5,r4
+LOCAL(div7):
+ div1 r5,r4; div1 r5,r4; div1 r5,r4
+ div1 r5,r4; div1 r5,r4; div1 r5,r4; rts; div1 r5,r4
+
+LOCAL(divx4):
+ div1 r5,r4; rotcl r0
+ div1 r5,r4; rotcl r0
+ div1 r5,r4; rotcl r0
+ rts; div1 r5,r4
+
+GLOBAL(udivsi3):
+ sts.l pr,@-r15
+ extu.w r5,r0
+ cmp/eq r5,r0
+#ifdef __sh1__
+ bf LOCAL(large_divisor)
+#else
+ bf/s LOCAL(large_divisor)
+#endif
+ div0u
+ swap.w r4,r0
+ shlr16 r4
+ bsr LOCAL(div8)
+ shll16 r5
+ bsr LOCAL(div7)
+ div1 r5,r4
+ xtrct r4,r0
+ xtrct r0,r4
+ bsr LOCAL(div8)
+ swap.w r4,r4
+ bsr LOCAL(div7)
+ div1 r5,r4
+ lds.l @r15+,pr
+ xtrct r4,r0
+ swap.w r0,r0
+ rotcl r0
+ rts
+ shlr16 r5
+
+LOCAL(large_divisor):
+#ifdef __sh1__
+ div0u
+#endif
+ mov #0,r0
+ xtrct r4,r0
+ xtrct r0,r4
+ bsr LOCAL(divx4)
+ rotcl r0
+ bsr LOCAL(divx4)
+ rotcl r0
+ bsr LOCAL(divx4)
+ rotcl r0
+ bsr LOCAL(divx4)
+ rotcl r0
+ lds.l @r15+,pr
+ rts
+ rotcl r0
+
+	ENDFUNC(GLOBAL(udivsi3))
+#endif /* ! __SHMEDIA__ */
+#endif /* __SH4__ */
+#endif /* L_udivsi3 */
+
+#ifdef L_udivdi3
+#ifdef __SHMEDIA__
+	.mode	SHmedia
+	.section	.text..SHmedia32,"ax"
+	.align	2
+	.global	GLOBAL(udivdi3)
+	FUNC(GLOBAL(udivdi3))
+GLOBAL(udivdi3):
+	HIDDEN_ALIAS(udivdi3_internal,udivdi3)
+	shlri r3,1,r4
+	nsb r4,r22
+	shlld r3,r22,r6
+	shlri r6,49,r5
+	movi 0xffffffffffffbaf1,r21 /* .l shift count 17.  */
+	sub r21,r5,r1
+	mmulfx.w r1,r1,r4
+	mshflo.w r1,r63,r1
+	sub r63,r22,r20 // r63 == 64 % 64
+	mmulfx.w r5,r4,r4
+	pta LOCAL(large_divisor),tr0
+	addi r20,32,r9
+	msub.w r1,r4,r1
+	madd.w r1,r1,r1
+	mmulfx.w r1,r1,r4
+	shlri r6,32,r7
+	bgt/u r9,r63,tr0 // large_divisor
+	mmulfx.w r5,r4,r4
+	shlri r2,32+14,r19
+	addi r22,-31,r0
+	msub.w r1,r4,r1
+
+	mulu.l r1,r7,r4
+	addi r1,-3,r5
+	mulu.l r5,r19,r5
+	sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
+	shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
+	                 the case may be, %0000000000000000 000.11111111111, still */
+	muls.l r1,r4,r4 /* leaving at least one sign bit.  */
+	mulu.l r5,r3,r8
+	mshalds.l r1,r21,r1
+	shari r4,26,r4
+	shlld r8,r0,r8
+	add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+	sub r2,r8,r2
+	/* Can do second step of 64 : 32 div now, using r1 and the rest in r2.  */
+
+	shlri r2,22,r21
+	mulu.l r21,r1,r21
+	shlld r5,r0,r8
+	addi r20,30-22,r0
+	shlrd r21,r0,r21
+	mulu.l r21,r3,r5
+	add r8,r21,r8
+	mcmpgt.l r21,r63,r21 // See Note 1
+	addi r20,30,r0
+	mshfhi.l r63,r21,r21
+	sub r2,r5,r2
+	andc r2,r21,r2
+
+	/* small divisor: need a third divide step */
+	mulu.l r2,r1,r7
+	ptabs r18,tr0
+	addi r2,1,r2
+	shlrd r7,r0,r7
+	mulu.l r7,r3,r5
+	add r8,r7,r8
+	sub r2,r3,r2
+	cmpgt r2,r5,r5
+	add r8,r5,r2
+	/* could test r3 here to check for divide by zero.  */
+	blink tr0,r63
+
+LOCAL(large_divisor):
+	mmulfx.w r5,r4,r4
+	shlrd r2,r9,r25
+	shlri r25,32,r8
+	msub.w r1,r4,r1
+
+	mulu.l r1,r7,r4
+	addi r1,-3,r5
+	mulu.l r5,r8,r5
+	sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
+	shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
+	                 the case may be, %0000000000000000 000.11111111111, still */
+	muls.l r1,r4,r4 /* leaving at least one sign bit.  */
+	shlri r5,14-1,r8
+	mulu.l r8,r7,r5
+	mshalds.l r1,r21,r1
+	shari r4,26,r4
+	add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+	sub r25,r5,r25
+	/* Can do second step of 64 : 32 div now, using r1 and the rest in r25.  */
+
+	shlri r25,22,r21
+	mulu.l r21,r1,r21
+	pta LOCAL(no_lo_adj),tr0
+	addi r22,32,r0
+	shlri r21,40,r21
+	mulu.l r21,r7,r5
+	add r8,r21,r8
+	shlld r2,r0,r2
+	sub r25,r5,r25
+	bgtu/u r7,r25,tr0 // no_lo_adj
+	addi r8,1,r8
+	sub r25,r7,r25
+LOCAL(no_lo_adj):
+	mextr4 r2,r25,r2
+
+	/* large_divisor: only needs a few adjustments.  */
+	mulu.l r8,r6,r5
+	ptabs r18,tr0
+	/* bubble */
+	cmpgtu r5,r2,r5
+	sub r8,r5,r2
+	blink tr0,r63
+	ENDFUNC(GLOBAL(udivdi3))
+/* Note 1: To shift the result of the second divide stage so that the result
+   always fits into 32 bits, yet we still reduce the rest sufficiently
+   would require a lot of instructions to do the shifts just right.  Using
+   the full 64 bit shift result to multiply with the divisor would require
+   four extra instructions for the upper 32 bits (shift / mulu / shift / sub).
+   Fortunately, if the upper 32 bits of the shift result are nonzero, we
+   know that the rest after taking this partial result into account will
+   fit into 32 bits.  So we just clear the upper 32 bits of the rest if the
+   upper 32 bits of the partial result are nonzero.  */
+#endif /* __SHMEDIA__ */
+#endif /* L_udivdi3 */
+
+#ifdef L_divdi3
+#ifdef __SHMEDIA__
+	.mode	SHmedia
+	.section	.text..SHmedia32,"ax"
+	.align	2
+	.global	GLOBAL(divdi3)
+	FUNC(GLOBAL(divdi3))
+GLOBAL(divdi3):
+	pta GLOBAL(udivdi3_internal),tr0
+	shari r2,63,r22
+	shari r3,63,r23
+	xor r2,r22,r2
+	xor r3,r23,r3
+	sub r2,r22,r2
+	sub r3,r23,r3
+	beq/u r22,r23,tr0
+	ptabs r18,tr1
+	blink tr0,r18
+	sub r63,r2,r2
+	blink tr1,r63
+	ENDFUNC(GLOBAL(divdi3))
+#endif /* __SHMEDIA__ */
+#endif /* L_divdi3 */
+
+#ifdef L_umoddi3
+#ifdef __SHMEDIA__
+	.mode	SHmedia
+	.section	.text..SHmedia32,"ax"
+	.align	2
+	.global	GLOBAL(umoddi3)
+	FUNC(GLOBAL(umoddi3))
+GLOBAL(umoddi3):
+	HIDDEN_ALIAS(umoddi3_internal,umoddi3)
+	shlri r3,1,r4
+	nsb r4,r22
+	shlld r3,r22,r6
+	shlri r6,49,r5
+	movi 0xffffffffffffbaf1,r21 /* .l shift count 17.  */
+	sub r21,r5,r1
+	mmulfx.w r1,r1,r4
+	mshflo.w r1,r63,r1
+	sub r63,r22,r20 // r63 == 64 % 64
+	mmulfx.w r5,r4,r4
+	pta LOCAL(large_divisor),tr0
+	addi r20,32,r9
+	msub.w r1,r4,r1
+	madd.w r1,r1,r1
+	mmulfx.w r1,r1,r4
+	shlri r6,32,r7
+	bgt/u r9,r63,tr0 // large_divisor
+	mmulfx.w r5,r4,r4
+	shlri r2,32+14,r19
+	addi r22,-31,r0
+	msub.w r1,r4,r1
+
+	mulu.l r1,r7,r4
+	addi r1,-3,r5
+	mulu.l r5,r19,r5
+	sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
+	shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
+	                 the case may be, %0000000000000000 000.11111111111, still */
+	muls.l r1,r4,r4 /* leaving at least one sign bit.  */
+	mulu.l r5,r3,r5
+	mshalds.l r1,r21,r1
+	shari r4,26,r4
+	shlld r5,r0,r5
+	add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+	sub r2,r5,r2
+	/* Can do second step of 64 : 32 div now, using r1 and the rest in r2.  */
+
+	shlri r2,22,r21
+	mulu.l r21,r1,r21
+	addi r20,30-22,r0
+	/* bubble */ /* could test r3 here to check for divide by zero.  */
+	shlrd r21,r0,r21
+	mulu.l r21,r3,r5
+	mcmpgt.l r21,r63,r21 // See Note 1
+	addi r20,30,r0
+	mshfhi.l r63,r21,r21
+	sub r2,r5,r2
+	andc r2,r21,r2
+
+	/* small divisor: need a third divide step */
+	mulu.l r2,r1,r7
+	ptabs r18,tr0
+	sub r2,r3,r8 /* re-use r8 here for rest - r3 */
+	shlrd r7,r0,r7
+	mulu.l r7,r3,r5
+	/* bubble */
+	addi r8,1,r7
+	cmpgt r7,r5,r7
+	cmvne r7,r8,r2
+	sub r2,r5,r2
+	blink tr0,r63
+
+LOCAL(large_divisor):
+	mmulfx.w r5,r4,r4
+	shlrd r2,r9,r25
+	shlri r25,32,r8
+	msub.w r1,r4,r1
+
+	mulu.l r1,r7,r4
+	addi r1,-3,r5
+	mulu.l r5,r8,r5
+	sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
+	shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
+	                 the case may be, %0000000000000000 000.11111111111, still */
+	muls.l r1,r4,r4 /* leaving at least one sign bit.  */
+	shlri r5,14-1,r8
+	mulu.l r8,r7,r5
+	mshalds.l r1,r21,r1
+	shari r4,26,r4
+	add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+	sub r25,r5,r25
+	/* Can do second step of 64 : 32 div now, using r1 and the rest in r25.  */
+
+	shlri r25,22,r21
+	mulu.l r21,r1,r21
+	pta LOCAL(no_lo_adj),tr0
+	addi r22,32,r0
+	shlri r21,40,r21
+	mulu.l r21,r7,r5
+	add r8,r21,r8
+	shlld r2,r0,r2
+	sub r25,r5,r25
+	bgtu/u r7,r25,tr0 // no_lo_adj
+	addi r8,1,r8
+	sub r25,r7,r25
+LOCAL(no_lo_adj):
+	mextr4 r2,r25,r2
+
+	/* large_divisor: only needs a few adjustments.  */
+	mulu.l r8,r6,r5
+	ptabs r18,tr0
+	add r2,r6,r7
+	cmpgtu r5,r2,r8
+	cmvne r8,r7,r2
+	sub r2,r5,r2
+	shlrd r2,r22,r2
+	blink tr0,r63
+	ENDFUNC(GLOBAL(umoddi3))
+/* Note 1: To shift the result of the second divide stage so that the result
+   always fits into 32 bits, yet we still reduce the rest sufficiently
+   would require a lot of instructions to do the shifts just right.  Using
+   the full 64 bit shift result to multiply with the divisor would require
+   four extra instructions for the upper 32 bits (shift / mulu / shift / sub).
+   Fortunately, if the upper 32 bits of the shift result are nonzero, we
+   know that the rest after taking this partial result into account will
+   fit into 32 bits.  So we just clear the upper 32 bits of the rest if the
+   upper 32 bits of the partial result are nonzero.  */
+#endif /* __SHMEDIA__ */
+#endif /* L_umoddi3 */
+
+#ifdef L_moddi3
+#ifdef __SHMEDIA__
+	.mode	SHmedia
+	.section	.text..SHmedia32,"ax"
+	.align	2
+	.global	GLOBAL(moddi3)
+	FUNC(GLOBAL(moddi3))
+GLOBAL(moddi3):
+	pta GLOBAL(umoddi3_internal),tr0
+	shari r2,63,r22
+	shari r3,63,r23
+	xor r2,r22,r2
+	xor r3,r23,r3
+	sub r2,r22,r2
+	sub r3,r23,r3
+	beq/u r22,r63,tr0
+	ptabs r18,tr1
+	blink tr0,r18
+	sub r63,r2,r2
+	blink tr1,r63
+	ENDFUNC(GLOBAL(moddi3))
+#endif /* __SHMEDIA__ */
+#endif /* L_moddi3 */
+
+#ifdef L_set_fpscr
+#if !defined (__SH2A_NOFPU__)
+#if defined (__SH2E__) || defined (__SH2A__) || defined (__SH3E__) || defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || __SH5__ == 32
+#ifdef __SH5__
+	.mode	SHcompact
+#endif
+	.global GLOBAL(set_fpscr)
+	HIDDEN_FUNC(GLOBAL(set_fpscr))
+GLOBAL(set_fpscr):
+	lds r4,fpscr
+#ifdef __PIC__
+	mov.l	r12,@-r15
+#ifdef __vxworks
+	mov.l	LOCAL(set_fpscr_L0_base),r12
+	mov.l	LOCAL(set_fpscr_L0_index),r0
+	mov.l	@r12,r12
+	mov.l	@(r0,r12),r12
+#else
+	mova	LOCAL(set_fpscr_L0),r0
+	mov.l	LOCAL(set_fpscr_L0),r12
+	add	r0,r12
+#endif
+	mov.l	LOCAL(set_fpscr_L1),r0
+	mov.l	@(r0,r12),r1
+	mov.l	@r15+,r12
+#else
+	mov.l LOCAL(set_fpscr_L1),r1
+#endif
+	swap.w r4,r0
+	or #24,r0
+#ifndef FMOVD_WORKS
+	xor #16,r0
+#endif
+#if defined(__SH4__) || defined (__SH2A_DOUBLE__)
+	swap.w r0,r3
+	mov.l r3,@(4,r1)
+#else /* defined (__SH2E__) || defined(__SH3E__) || defined(__SH4_SINGLE*__) */
+	swap.w r0,r2
+	mov.l r2,@r1
+#endif
+#ifndef FMOVD_WORKS
+	xor #8,r0
+#else
+	xor #24,r0
+#endif
+#if defined(__SH4__) || defined (__SH2A_DOUBLE__)
+	swap.w r0,r2
+	rts
+	mov.l r2,@r1
+#else /* defined(__SH2E__) || defined(__SH3E__) || defined(__SH4_SINGLE*__) */
+	swap.w r0,r3
+	rts
+	mov.l r3,@(4,r1)
+#endif
+	.align 2
+#ifdef __PIC__
+#ifdef __vxworks
+LOCAL(set_fpscr_L0_base):
+	.long ___GOTT_BASE__
+LOCAL(set_fpscr_L0_index):
+	.long ___GOTT_INDEX__
+#else
+LOCAL(set_fpscr_L0):
+	.long _GLOBAL_OFFSET_TABLE_
+#endif
+LOCAL(set_fpscr_L1):
+	.long GLOBAL(fpscr_values@GOT)
+#else
+LOCAL(set_fpscr_L1):
+	.long GLOBAL(fpscr_values)
+#endif
+
+	ENDFUNC(GLOBAL(set_fpscr))
+#ifndef NO_FPSCR_VALUES
+#ifdef __ELF__
+        .comm   GLOBAL(fpscr_values),8,4
+#else
+        .comm   GLOBAL(fpscr_values),8
+#endif /* ELF */
+#endif /* NO_FPSCR_VALUES */
+#endif /* SH2E / SH3E / SH4 */
+#endif /* __SH2A_NOFPU__ */
+#endif /* L_set_fpscr */
+#ifdef L_ic_invalidate
+#if __SH5__ == 32
+	.mode	SHmedia
+	.section	.text..SHmedia32,"ax"
+	.align	2
+	.global	GLOBAL(init_trampoline)
+	HIDDEN_FUNC(GLOBAL(init_trampoline))
+GLOBAL(init_trampoline):
+	st.l	r0,8,r2
+#ifdef __LITTLE_ENDIAN__
+	movi	9,r20
+	shori	0x402b,r20
+	shori	0xd101,r20
+	shori	0xd002,r20
+#else
+	movi	0xffffffffffffd002,r20
+	shori	0xd101,r20
+	shori	0x402b,r20
+	shori	9,r20
+#endif
+	st.q	r0,0,r20
+	st.l	r0,12,r3
+	ENDFUNC(GLOBAL(init_trampoline))
+	.global	GLOBAL(ic_invalidate)
+	HIDDEN_FUNC(GLOBAL(ic_invalidate))
+GLOBAL(ic_invalidate):
+	ocbwb	r0,0
+	synco
+	icbi	r0, 0
+	ptabs	r18, tr0
+	synci
+	blink	tr0, r63
+	ENDFUNC(GLOBAL(ic_invalidate))
+#elif defined(__SH4A__)
+	.global GLOBAL(ic_invalidate)
+	HIDDEN_FUNC(GLOBAL(ic_invalidate))
+GLOBAL(ic_invalidate):
+	ocbwb	@r4
+	synco
+	icbi	@r4
+	rts
+	  nop
+	ENDFUNC(GLOBAL(ic_invalidate))
+#elif defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || (defined(__SH4_NOFPU__) && !defined(__SH5__))
+	/* For system code, we use ic_invalidate_line_i, but user code
+	   needs a different mechanism.  A kernel call is generally not
+	   available, and it would also be slow.  Different SH4 variants use
+	   different sizes and associativities of the Icache.  We use a small
+	   bit of dispatch code that can be put hidden in every shared object,
+	   which calls the actual processor-specific invalidation code in a
+	   separate module.
+	   Or if you have operating system support, the OS could mmap the
+	   procesor-specific code from a single page, since it is highly
+	   repetitive.  */
+	.global GLOBAL(ic_invalidate)
+	HIDDEN_FUNC(GLOBAL(ic_invalidate))
+GLOBAL(ic_invalidate):
+#ifdef __pic__
+#ifdef __vxworks
+	mov.l	1f,r1
+	mov.l	2f,r0
+	mov.l	@r1,r1
+	mov.l	0f,r2
+	mov.l	@(r0,r1),r0
+#else
+	mov.l	1f,r1
+	mova	1f,r0
+	mov.l	0f,r2
+	add	r1,r0
+#endif
+	mov.l	@(r0,r2),r1
+#else
+	mov.l	0f,r1
+#endif
+	ocbwb	@r4
+	mov.l	@(8,r1),r0
+	sub	r1,r4
+	and	r4,r0
+	add	r1,r0
+	jmp	@r0
+	mov.l	@(4,r1),r0
+	.align	2
+#ifndef __pic__
+0:	.long   GLOBAL(ic_invalidate_array)
+#else /* __pic__ */
+	.global GLOBAL(ic_invalidate_array)
+0:	.long   GLOBAL(ic_invalidate_array)@GOT
+#ifdef __vxworks
+1:	.long	___GOTT_BASE__
+2:	.long	___GOTT_INDEX__
+#else
+1:	.long   _GLOBAL_OFFSET_TABLE_
+#endif
+	ENDFUNC(GLOBAL(ic_invalidate))
+#endif /* __pic__ */
+#endif /* SH4 */
+#endif /* L_ic_invalidate */
+
+#ifdef L_ic_invalidate_array
+#if defined(__SH4A__) || (defined (__FORCE_SH4A__) && (defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || (defined(__SH4_NOFPU__) && !defined(__SH5__))))
+	.global GLOBAL(ic_invalidate_array)
+	/* This is needed when an SH4 dso with trampolines is used on SH4A.  */
+	.global GLOBAL(ic_invalidate_array)
+	FUNC(GLOBAL(ic_invalidate_array))
+GLOBAL(ic_invalidate_array):
+	add	r1,r4
+	synco
+	icbi	@r4
+	rts
+	  nop
+	.align 2
+	.long	0
+	ENDFUNC(GLOBAL(ic_invalidate_array))
+#elif defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || (defined(__SH4_NOFPU__) && !defined(__SH5__))
+	.global GLOBAL(ic_invalidate_array)
+	.p2align 5
+	FUNC(GLOBAL(ic_invalidate_array))
+/* This must be aligned to the beginning of a cache line.  */
+GLOBAL(ic_invalidate_array):
+#ifndef WAYS
+#define WAYS 4
+#define WAY_SIZE 0x4000
+#endif
+#if WAYS == 1
+	.rept	WAY_SIZE * WAYS / 32
+	rts
+	nop
+	.rept	7
+	.long	WAY_SIZE - 32
+	.endr
+	.endr
+#elif WAYS <= 6
+	.rept	WAY_SIZE * WAYS / 32
+	braf	r0
+	add	#-8,r0
+	.long	WAY_SIZE + 8
+	.long	WAY_SIZE - 32
+	.rept	WAYS-2
+	braf	r0
+	nop
+	.endr
+	.rept	7 - WAYS
+	rts
+	nop
+	.endr
+	.endr
+#else /* WAYS > 6 */
+	/* This variant needs two different pages for mmap-ing.  */
+ 	.rept	WAYS-1
+	.rept	WAY_SIZE / 32
+	braf	r0
+	nop
+	.long	WAY_SIZE
+	.rept 6
+	.long	WAY_SIZE - 32
+	.endr
+	.endr
+	.endr
+	.rept	WAY_SIZE / 32
+	rts
+	.rept	15
+	nop
+	.endr
+	.endr
+#endif /* WAYS */
+	ENDFUNC(GLOBAL(ic_invalidate_array))
+#endif /* SH4 */
+#endif /* L_ic_invalidate_array */
+
+#if defined (__SH5__) && __SH5__ == 32
+#ifdef L_shcompact_call_trampoline
+	.section	.rodata
+	.align	1
+LOCAL(ct_main_table):
+.word	LOCAL(ct_r2_fp) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r2_ld) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r2_pop) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r3_fp) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r3_ld) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r3_pop) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r4_fp) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r4_ld) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r4_pop) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r5_fp) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r5_ld) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r5_pop) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r6_fph) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r6_fpl) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r6_ld) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r6_pop) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r7_fph) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r7_fpl) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r7_ld) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r7_pop) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r8_fph) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r8_fpl) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r8_ld) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r8_pop) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r9_fph) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r9_fpl) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r9_ld) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r9_pop) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_pop_seq) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_pop_seq) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_r9_pop) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_ret_wide) - datalabel LOCAL(ct_main_label)
+.word	LOCAL(ct_call_func) - datalabel LOCAL(ct_main_label)
+	.mode	SHmedia
+	.section	.text..SHmedia32, "ax"
+	.align	2
+	
+     /* This function loads 64-bit general-purpose registers from the
+	stack, from a memory address contained in them or from an FP
+	register, according to a cookie passed in r1.  Its execution
+	time is linear on the number of registers that actually have
+	to be copied.  See sh.h for details on the actual bit pattern.
+
+	The function to be called is passed in r0.  If a 32-bit return
+	value is expected, the actual function will be tail-called,
+	otherwise the return address will be stored in r10 (that the
+	caller should expect to be clobbered) and the return value
+	will be expanded into r2/r3 upon return.  */
+	
+	.global	GLOBAL(GCC_shcompact_call_trampoline)
+	FUNC(GLOBAL(GCC_shcompact_call_trampoline))
+GLOBAL(GCC_shcompact_call_trampoline):
+	ptabs/l	r0, tr0	/* Prepare to call the actual function.  */
+	movi	((datalabel LOCAL(ct_main_table) - 31 * 2) >> 16) & 65535, r0
+	pt/l	LOCAL(ct_loop), tr1
+	addz.l	r1, r63, r1
+	shori	((datalabel LOCAL(ct_main_table) - 31 * 2)) & 65535, r0
+LOCAL(ct_loop):
+	nsb	r1, r28
+	shlli	r28, 1, r29
+	ldx.w	r0, r29, r30
+LOCAL(ct_main_label):
+	ptrel/l	r30, tr2
+	blink	tr2, r63
+LOCAL(ct_r2_fp):	/* Copy r2 from an FP register.  */
+	/* It must be dr0, so just do it.  */
+	fmov.dq	dr0, r2
+	movi	7, r30
+	shlli	r30, 29, r31
+	andc	r1, r31, r1
+	blink	tr1, r63
+LOCAL(ct_r3_fp):	/* Copy r3 from an FP register.  */
+	/* It is either dr0 or dr2.  */
+	movi	7, r30
+	shlri	r1, 26, r32
+	shlli	r30, 26, r31
+	andc	r1, r31, r1
+	fmov.dq	dr0, r3
+	beqi/l	r32, 4, tr1
+	fmov.dq	dr2, r3
+	blink	tr1, r63
+LOCAL(ct_r4_fp):	/* Copy r4 from an FP register.  */
+	shlri	r1, 23 - 3, r34
+	andi	r34, 3 << 3, r33
+	addi	r33, LOCAL(ct_r4_fp_copy) - datalabel LOCAL(ct_r4_fp_base), r32
+LOCAL(ct_r4_fp_base):
+	ptrel/l	r32, tr2
+	movi	7, r30
+	shlli	r30, 23, r31
+	andc	r1, r31, r1
+	blink	tr2, r63
+LOCAL(ct_r4_fp_copy):
+	fmov.dq	dr0, r4
+	blink	tr1, r63
+	fmov.dq	dr2, r4
+	blink	tr1, r63
+	fmov.dq	dr4, r4
+	blink	tr1, r63
+LOCAL(ct_r5_fp):	/* Copy r5 from an FP register.  */
+	shlri	r1, 20 - 3, r34
+	andi	r34, 3 << 3, r33
+	addi	r33, LOCAL(ct_r5_fp_copy) - datalabel LOCAL(ct_r5_fp_base), r32
+LOCAL(ct_r5_fp_base):
+	ptrel/l	r32, tr2
+	movi	7, r30
+	shlli	r30, 20, r31
+	andc	r1, r31, r1
+	blink	tr2, r63
+LOCAL(ct_r5_fp_copy):
+	fmov.dq	dr0, r5
+	blink	tr1, r63
+	fmov.dq	dr2, r5
+	blink	tr1, r63
+	fmov.dq	dr4, r5
+	blink	tr1, r63
+	fmov.dq	dr6, r5
+	blink	tr1, r63
+LOCAL(ct_r6_fph):	/* Copy r6 from a high FP register.  */
+	/* It must be dr8.  */
+	fmov.dq	dr8, r6
+	movi	15, r30
+	shlli	r30, 16, r31
+	andc	r1, r31, r1
+	blink	tr1, r63
+LOCAL(ct_r6_fpl):	/* Copy r6 from a low FP register.  */
+	shlri	r1, 16 - 3, r34
+	andi	r34, 3 << 3, r33
+	addi	r33, LOCAL(ct_r6_fp_copy) - datalabel LOCAL(ct_r6_fp_base), r32
+LOCAL(ct_r6_fp_base):
+	ptrel/l	r32, tr2
+	movi	7, r30
+	shlli	r30, 16, r31
+	andc	r1, r31, r1
+	blink	tr2, r63
+LOCAL(ct_r6_fp_copy):
+	fmov.dq	dr0, r6
+	blink	tr1, r63
+	fmov.dq	dr2, r6
+	blink	tr1, r63
+	fmov.dq	dr4, r6
+	blink	tr1, r63
+	fmov.dq	dr6, r6
+	blink	tr1, r63
+LOCAL(ct_r7_fph):	/* Copy r7 from a high FP register.  */
+	/* It is either dr8 or dr10.  */
+	movi	15 << 12, r31
+	shlri	r1, 12, r32
+	andc	r1, r31, r1
+	fmov.dq	dr8, r7
+	beqi/l	r32, 8, tr1
+	fmov.dq	dr10, r7
+	blink	tr1, r63
+LOCAL(ct_r7_fpl):	/* Copy r7 from a low FP register.  */
+	shlri	r1, 12 - 3, r34
+	andi	r34, 3 << 3, r33
+	addi	r33, LOCAL(ct_r7_fp_copy) - datalabel LOCAL(ct_r7_fp_base), r32
+LOCAL(ct_r7_fp_base):
+	ptrel/l	r32, tr2
+	movi	7 << 12, r31
+	andc	r1, r31, r1
+	blink	tr2, r63
+LOCAL(ct_r7_fp_copy):
+	fmov.dq	dr0, r7
+	blink	tr1, r63
+	fmov.dq	dr2, r7
+	blink	tr1, r63
+	fmov.dq	dr4, r7
+	blink	tr1, r63
+	fmov.dq	dr6, r7
+	blink	tr1, r63
+LOCAL(ct_r8_fph):	/* Copy r8 from a high FP register.  */
+	/* It is either dr8 or dr10.  */
+	movi	15 << 8, r31
+	andi	r1, 1 << 8, r32
+	andc	r1, r31, r1
+	fmov.dq	dr8, r8
+	beq/l	r32, r63, tr1
+	fmov.dq	dr10, r8
+	blink	tr1, r63
+LOCAL(ct_r8_fpl):	/* Copy r8 from a low FP register.  */
+	shlri	r1, 8 - 3, r34
+	andi	r34, 3 << 3, r33
+	addi	r33, LOCAL(ct_r8_fp_copy) - datalabel LOCAL(ct_r8_fp_base), r32
+LOCAL(ct_r8_fp_base):
+	ptrel/l	r32, tr2
+	movi	7 << 8, r31
+	andc	r1, r31, r1
+	blink	tr2, r63
+LOCAL(ct_r8_fp_copy):
+	fmov.dq	dr0, r8
+	blink	tr1, r63
+	fmov.dq	dr2, r8
+	blink	tr1, r63
+	fmov.dq	dr4, r8
+	blink	tr1, r63
+	fmov.dq	dr6, r8
+	blink	tr1, r63
+LOCAL(ct_r9_fph):	/* Copy r9 from a high FP register.  */
+	/* It is either dr8 or dr10.  */
+	movi	15 << 4, r31
+	andi	r1, 1 << 4, r32
+	andc	r1, r31, r1
+	fmov.dq	dr8, r9
+	beq/l	r32, r63, tr1
+	fmov.dq	dr10, r9
+	blink	tr1, r63
+LOCAL(ct_r9_fpl):	/* Copy r9 from a low FP register.  */
+	shlri	r1, 4 - 3, r34
+	andi	r34, 3 << 3, r33
+	addi	r33, LOCAL(ct_r9_fp_copy) - datalabel LOCAL(ct_r9_fp_base), r32
+LOCAL(ct_r9_fp_base):
+	ptrel/l	r32, tr2
+	movi	7 << 4, r31
+	andc	r1, r31, r1
+	blink	tr2, r63
+LOCAL(ct_r9_fp_copy):
+	fmov.dq	dr0, r9
+	blink	tr1, r63
+	fmov.dq	dr2, r9
+	blink	tr1, r63
+	fmov.dq	dr4, r9
+	blink	tr1, r63
+	fmov.dq	dr6, r9
+	blink	tr1, r63
+LOCAL(ct_r2_ld):	/* Copy r2 from a memory address.  */
+	pt/l	LOCAL(ct_r2_load), tr2
+	movi	3, r30
+	shlli	r30, 29, r31
+	and	r1, r31, r32
+	andc	r1, r31, r1
+	beq/l	r31, r32, tr2
+	addi.l	r2, 8, r3
+	ldx.q	r2, r63, r2
+	/* Fall through.  */
+LOCAL(ct_r3_ld):	/* Copy r3 from a memory address.  */
+	pt/l	LOCAL(ct_r3_load), tr2
+	movi	3, r30
+	shlli	r30, 26, r31
+	and	r1, r31, r32
+	andc	r1, r31, r1
+	beq/l	r31, r32, tr2
+	addi.l	r3, 8, r4
+	ldx.q	r3, r63, r3
+LOCAL(ct_r4_ld):	/* Copy r4 from a memory address.  */
+	pt/l	LOCAL(ct_r4_load), tr2
+	movi	3, r30
+	shlli	r30, 23, r31
+	and	r1, r31, r32
+	andc	r1, r31, r1
+	beq/l	r31, r32, tr2
+	addi.l	r4, 8, r5
+	ldx.q	r4, r63, r4
+LOCAL(ct_r5_ld):	/* Copy r5 from a memory address.  */
+	pt/l	LOCAL(ct_r5_load), tr2
+	movi	3, r30
+	shlli	r30, 20, r31
+	and	r1, r31, r32
+	andc	r1, r31, r1
+	beq/l	r31, r32, tr2
+	addi.l	r5, 8, r6
+	ldx.q	r5, r63, r5
+LOCAL(ct_r6_ld):	/* Copy r6 from a memory address.  */
+	pt/l	LOCAL(ct_r6_load), tr2
+	movi	3 << 16, r31
+	and	r1, r31, r32
+	andc	r1, r31, r1
+	beq/l	r31, r32, tr2
+	addi.l	r6, 8, r7
+	ldx.q	r6, r63, r6
+LOCAL(ct_r7_ld):	/* Copy r7 from a memory address.  */
+	pt/l	LOCAL(ct_r7_load), tr2
+	movi	3 << 12, r31
+	and	r1, r31, r32
+	andc	r1, r31, r1
+	beq/l	r31, r32, tr2
+	addi.l	r7, 8, r8
+	ldx.q	r7, r63, r7
+LOCAL(ct_r8_ld):	/* Copy r8 from a memory address.  */
+	pt/l	LOCAL(ct_r8_load), tr2
+	movi	3 << 8, r31
+	and	r1, r31, r32
+	andc	r1, r31, r1
+	beq/l	r31, r32, tr2
+	addi.l	r8, 8, r9
+	ldx.q	r8, r63, r8
+LOCAL(ct_r9_ld):	/* Copy r9 from a memory address.  */
+	pt/l	LOCAL(ct_check_tramp), tr2
+	ldx.q	r9, r63, r9
+	blink	tr2, r63
+LOCAL(ct_r2_load):
+	ldx.q	r2, r63, r2
+	blink	tr1, r63
+LOCAL(ct_r3_load):
+	ldx.q	r3, r63, r3
+	blink	tr1, r63
+LOCAL(ct_r4_load):
+	ldx.q	r4, r63, r4
+	blink	tr1, r63
+LOCAL(ct_r5_load):
+	ldx.q	r5, r63, r5
+	blink	tr1, r63
+LOCAL(ct_r6_load):
+	ldx.q	r6, r63, r6
+	blink	tr1, r63
+LOCAL(ct_r7_load):
+	ldx.q	r7, r63, r7
+	blink	tr1, r63
+LOCAL(ct_r8_load):
+	ldx.q	r8, r63, r8
+	blink	tr1, r63
+LOCAL(ct_r2_pop):	/* Pop r2 from the stack.  */
+	movi	1, r30
+	ldx.q	r15, r63, r2
+	shlli	r30, 29, r31
+	addi.l	r15, 8, r15
+	andc	r1, r31, r1
+	blink	tr1, r63
+LOCAL(ct_r3_pop):	/* Pop r3 from the stack.  */
+	movi	1, r30
+	ldx.q	r15, r63, r3
+	shlli	r30, 26, r31
+	addi.l	r15, 8, r15
+	andc	r1, r31, r1
+	blink	tr1, r63
+LOCAL(ct_r4_pop):	/* Pop r4 from the stack.  */
+	movi	1, r30
+	ldx.q	r15, r63, r4
+	shlli	r30, 23, r31
+	addi.l	r15, 8, r15
+	andc	r1, r31, r1
+	blink	tr1, r63
+LOCAL(ct_r5_pop):	/* Pop r5 from the stack.  */
+	movi	1, r30
+	ldx.q	r15, r63, r5
+	shlli	r30, 20, r31
+	addi.l	r15, 8, r15
+	andc	r1, r31, r1
+	blink	tr1, r63
+LOCAL(ct_r6_pop):	/* Pop r6 from the stack.  */
+	movi	1, r30
+	ldx.q	r15, r63, r6
+	shlli	r30, 16, r31
+	addi.l	r15, 8, r15
+	andc	r1, r31, r1
+	blink	tr1, r63
+LOCAL(ct_r7_pop):	/* Pop r7 from the stack.  */
+	ldx.q	r15, r63, r7
+	movi	1 << 12, r31
+	addi.l	r15, 8, r15
+	andc	r1, r31, r1
+	blink	tr1, r63
+LOCAL(ct_r8_pop):	/* Pop r8 from the stack.  */
+	ldx.q	r15, r63, r8
+	movi	1 << 8, r31
+	addi.l	r15, 8, r15
+	andc	r1, r31, r1
+	blink	tr1, r63
+LOCAL(ct_pop_seq):	/* Pop a sequence of registers off the stack.  */
+	andi	r1, 7 << 1, r30
+	movi	(LOCAL(ct_end_of_pop_seq) >> 16) & 65535, r32
+	shlli	r30, 2, r31
+	shori	LOCAL(ct_end_of_pop_seq) & 65535, r32
+	sub.l	r32, r31, r33
+	ptabs/l	r33, tr2
+	blink	tr2, r63
+LOCAL(ct_start_of_pop_seq):	/* Beginning of pop sequence.  */
+	ldx.q	r15, r63, r3
+	addi.l	r15, 8, r15
+	ldx.q	r15, r63, r4
+	addi.l	r15, 8, r15
+	ldx.q	r15, r63, r5
+	addi.l	r15, 8, r15
+	ldx.q	r15, r63, r6
+	addi.l	r15, 8, r15
+	ldx.q	r15, r63, r7
+	addi.l	r15, 8, r15
+	ldx.q	r15, r63, r8
+	addi.l	r15, 8, r15
+LOCAL(ct_r9_pop):	/* Pop r9 from the stack.  */
+	ldx.q	r15, r63, r9
+	addi.l	r15, 8, r15
+LOCAL(ct_end_of_pop_seq): /* Label used to compute first pop instruction.  */
+LOCAL(ct_check_tramp):	/* Check whether we need a trampoline.  */
+	pt/u	LOCAL(ct_ret_wide), tr2
+	andi	r1, 1, r1
+	bne/u	r1, r63, tr2
+LOCAL(ct_call_func):	/* Just branch to the function.  */
+	blink	tr0, r63
+LOCAL(ct_ret_wide):	/* Call the function, so that we can unpack its 
+			   64-bit return value.  */
+	add.l	r18, r63, r10
+	blink	tr0, r18
+	ptabs	r10, tr0
+#if __LITTLE_ENDIAN__
+	shari	r2, 32, r3
+	add.l	r2, r63, r2
+#else
+	add.l	r2, r63, r3
+	shari	r2, 32, r2
+#endif
+	blink	tr0, r63
+
+	ENDFUNC(GLOBAL(GCC_shcompact_call_trampoline))
+#endif /* L_shcompact_call_trampoline */
+
+#ifdef L_shcompact_return_trampoline
+     /* This function does the converse of the code in `ret_wide'
+	above.  It is tail-called by SHcompact functions returning
+	64-bit non-floating-point values, to pack the 32-bit values in
+	r2 and r3 into r2.  */
+
+	.mode	SHmedia
+	.section	.text..SHmedia32, "ax"
+	.align	2
+	.global	GLOBAL(GCC_shcompact_return_trampoline)
+	HIDDEN_FUNC(GLOBAL(GCC_shcompact_return_trampoline))
+GLOBAL(GCC_shcompact_return_trampoline):
+	ptabs/l	r18, tr0
+#if __LITTLE_ENDIAN__
+	addz.l	r2, r63, r2
+	shlli	r3, 32, r3
+#else
+	addz.l	r3, r63, r3
+	shlli	r2, 32, r2
+#endif
+	or	r3, r2, r2
+	blink	tr0, r63
+
+	ENDFUNC(GLOBAL(GCC_shcompact_return_trampoline))
+#endif /* L_shcompact_return_trampoline */
+
+#ifdef L_shcompact_incoming_args
+	.section	.rodata
+	.align	1
+LOCAL(ia_main_table):
+.word	1 /* Invalid, just loop */
+.word	LOCAL(ia_r2_ld) - datalabel LOCAL(ia_main_label)
+.word	LOCAL(ia_r2_push) - datalabel LOCAL(ia_main_label)
+.word	1 /* Invalid, just loop */
+.word	LOCAL(ia_r3_ld) - datalabel LOCAL(ia_main_label)
+.word	LOCAL(ia_r3_push) - datalabel LOCAL(ia_main_label)
+.word	1 /* Invalid, just loop */
+.word	LOCAL(ia_r4_ld) - datalabel LOCAL(ia_main_label)
+.word	LOCAL(ia_r4_push) - datalabel LOCAL(ia_main_label)
+.word	1 /* Invalid, just loop */
+.word	LOCAL(ia_r5_ld) - datalabel LOCAL(ia_main_label)
+.word	LOCAL(ia_r5_push) - datalabel LOCAL(ia_main_label)
+.word	1 /* Invalid, just loop */
+.word	1 /* Invalid, just loop */
+.word	LOCAL(ia_r6_ld) - datalabel LOCAL(ia_main_label)
+.word	LOCAL(ia_r6_push) - datalabel LOCAL(ia_main_label)
+.word	1 /* Invalid, just loop */
+.word	1 /* Invalid, just loop */
+.word	LOCAL(ia_r7_ld) - datalabel LOCAL(ia_main_label)
+.word	LOCAL(ia_r7_push) - datalabel LOCAL(ia_main_label)
+.word	1 /* Invalid, just loop */
+.word	1 /* Invalid, just loop */
+.word	LOCAL(ia_r8_ld) - datalabel LOCAL(ia_main_label)
+.word	LOCAL(ia_r8_push) - datalabel LOCAL(ia_main_label)
+.word	1 /* Invalid, just loop */
+.word	1 /* Invalid, just loop */
+.word	LOCAL(ia_r9_ld) - datalabel LOCAL(ia_main_label)
+.word	LOCAL(ia_r9_push) - datalabel LOCAL(ia_main_label)
+.word	LOCAL(ia_push_seq) - datalabel LOCAL(ia_main_label)
+.word	LOCAL(ia_push_seq) - datalabel LOCAL(ia_main_label)
+.word	LOCAL(ia_r9_push) - datalabel LOCAL(ia_main_label)
+.word	LOCAL(ia_return) - datalabel LOCAL(ia_main_label)
+.word	LOCAL(ia_return) - datalabel LOCAL(ia_main_label)
+	.mode	SHmedia
+	.section	.text..SHmedia32, "ax"
+	.align	2
+	
+     /* This function stores 64-bit general-purpose registers back in
+	the stack, and loads the address in which each register
+	was stored into itself.  The lower 32 bits of r17 hold the address
+	to begin storing, and the upper 32 bits of r17 hold the cookie.
+	Its execution time is linear on the
+	number of registers that actually have to be copied, and it is
+	optimized for structures larger than 64 bits, as opposed to
+	individual `long long' arguments.  See sh.h for details on the
+	actual bit pattern.  */
+	
+	.global	GLOBAL(GCC_shcompact_incoming_args)
+ 	FUNC(GLOBAL(GCC_shcompact_incoming_args))
+GLOBAL(GCC_shcompact_incoming_args):
+	ptabs/l	r18, tr0	/* Prepare to return.  */
+	shlri	r17, 32, r0	/* Load the cookie.  */
+	movi	((datalabel LOCAL(ia_main_table) - 31 * 2) >> 16) & 65535, r43
+	pt/l	LOCAL(ia_loop), tr1
+	add.l	r17, r63, r17
+	shori	((datalabel LOCAL(ia_main_table) - 31 * 2)) & 65535, r43
+LOCAL(ia_loop):
+	nsb	r0, r36
+	shlli	r36, 1, r37
+	ldx.w	r43, r37, r38
+LOCAL(ia_main_label):
+	ptrel/l	r38, tr2
+	blink	tr2, r63
+LOCAL(ia_r2_ld):	/* Store r2 and load its address.  */
+	movi	3, r38
+	shlli	r38, 29, r39
+	and	r0, r39, r40
+	andc	r0, r39, r0
+	stx.q	r17, r63, r2
+	add.l	r17, r63, r2
+	addi.l	r17, 8, r17
+	beq/u	r39, r40, tr1
+LOCAL(ia_r3_ld):	/* Store r3 and load its address.  */
+	movi	3, r38
+	shlli	r38, 26, r39
+	and	r0, r39, r40
+	andc	r0, r39, r0
+	stx.q	r17, r63, r3
+	add.l	r17, r63, r3
+	addi.l	r17, 8, r17
+	beq/u	r39, r40, tr1
+LOCAL(ia_r4_ld):	/* Store r4 and load its address.  */
+	movi	3, r38
+	shlli	r38, 23, r39
+	and	r0, r39, r40
+	andc	r0, r39, r0
+	stx.q	r17, r63, r4
+	add.l	r17, r63, r4
+	addi.l	r17, 8, r17
+	beq/u	r39, r40, tr1
+LOCAL(ia_r5_ld):	/* Store r5 and load its address.  */
+	movi	3, r38
+	shlli	r38, 20, r39
+	and	r0, r39, r40
+	andc	r0, r39, r0
+	stx.q	r17, r63, r5
+	add.l	r17, r63, r5
+	addi.l	r17, 8, r17
+	beq/u	r39, r40, tr1
+LOCAL(ia_r6_ld):	/* Store r6 and load its address.  */
+	movi	3, r38
+	shlli	r38, 16, r39
+	and	r0, r39, r40
+	andc	r0, r39, r0
+	stx.q	r17, r63, r6
+	add.l	r17, r63, r6
+	addi.l	r17, 8, r17
+	beq/u	r39, r40, tr1
+LOCAL(ia_r7_ld):	/* Store r7 and load its address.  */
+	movi	3 << 12, r39
+	and	r0, r39, r40
+	andc	r0, r39, r0
+	stx.q	r17, r63, r7
+	add.l	r17, r63, r7
+	addi.l	r17, 8, r17
+	beq/u	r39, r40, tr1
+LOCAL(ia_r8_ld):	/* Store r8 and load its address.  */
+	movi	3 << 8, r39
+	and	r0, r39, r40
+	andc	r0, r39, r0
+	stx.q	r17, r63, r8
+	add.l	r17, r63, r8
+	addi.l	r17, 8, r17
+	beq/u	r39, r40, tr1
+LOCAL(ia_r9_ld):	/* Store r9 and load its address.  */
+	stx.q	r17, r63, r9
+	add.l	r17, r63, r9
+	blink	tr0, r63
+LOCAL(ia_r2_push):	/* Push r2 onto the stack.  */
+	movi	1, r38
+	shlli	r38, 29, r39
+	andc	r0, r39, r0
+	stx.q	r17, r63, r2
+	addi.l	r17, 8, r17
+	blink	tr1, r63
+LOCAL(ia_r3_push):	/* Push r3 onto the stack.  */
+	movi	1, r38
+	shlli	r38, 26, r39
+	andc	r0, r39, r0
+	stx.q	r17, r63, r3
+	addi.l	r17, 8, r17
+	blink	tr1, r63
+LOCAL(ia_r4_push):	/* Push r4 onto the stack.  */
+	movi	1, r38
+	shlli	r38, 23, r39
+	andc	r0, r39, r0
+	stx.q	r17, r63, r4
+	addi.l	r17, 8, r17
+	blink	tr1, r63
+LOCAL(ia_r5_push):	/* Push r5 onto the stack.  */
+	movi	1, r38
+	shlli	r38, 20, r39
+	andc	r0, r39, r0
+	stx.q	r17, r63, r5
+	addi.l	r17, 8, r17
+	blink	tr1, r63
+LOCAL(ia_r6_push):	/* Push r6 onto the stack.  */
+	movi	1, r38
+	shlli	r38, 16, r39
+	andc	r0, r39, r0
+	stx.q	r17, r63, r6
+	addi.l	r17, 8, r17
+	blink	tr1, r63
+LOCAL(ia_r7_push):	/* Push r7 onto the stack.  */
+	movi	1 << 12, r39
+	andc	r0, r39, r0
+	stx.q	r17, r63, r7
+	addi.l	r17, 8, r17
+	blink	tr1, r63
+LOCAL(ia_r8_push):	/* Push r8 onto the stack.  */
+	movi	1 << 8, r39
+	andc	r0, r39, r0
+	stx.q	r17, r63, r8
+	addi.l	r17, 8, r17
+	blink	tr1, r63
+LOCAL(ia_push_seq):	/* Push a sequence of registers onto the stack.  */
+	andi	r0, 7 << 1, r38
+	movi	(LOCAL(ia_end_of_push_seq) >> 16) & 65535, r40
+	shlli	r38, 2, r39
+	shori	LOCAL(ia_end_of_push_seq) & 65535, r40
+	sub.l	r40, r39, r41
+	ptabs/l	r41, tr2
+	blink	tr2, r63
+LOCAL(ia_stack_of_push_seq):	 /* Beginning of push sequence.  */
+	stx.q	r17, r63, r3
+	addi.l	r17, 8, r17
+	stx.q	r17, r63, r4
+	addi.l	r17, 8, r17
+	stx.q	r17, r63, r5
+	addi.l	r17, 8, r17
+	stx.q	r17, r63, r6
+	addi.l	r17, 8, r17
+	stx.q	r17, r63, r7
+	addi.l	r17, 8, r17
+	stx.q	r17, r63, r8
+	addi.l	r17, 8, r17
+LOCAL(ia_r9_push):	/* Push r9 onto the stack.  */
+	stx.q	r17, r63, r9
+LOCAL(ia_return):	/* Return.  */
+	blink	tr0, r63
+LOCAL(ia_end_of_push_seq): /* Label used to compute the first push instruction.  */
+	ENDFUNC(GLOBAL(GCC_shcompact_incoming_args))
+#endif /* L_shcompact_incoming_args */
+#endif
+#if __SH5__
+#ifdef L_nested_trampoline
+#if __SH5__ == 32
+	.section	.text..SHmedia32,"ax"
+#else
+	.text
+#endif
+	.align	3 /* It is copied in units of 8 bytes in SHmedia mode.  */
+	.global	GLOBAL(GCC_nested_trampoline)
+	HIDDEN_FUNC(GLOBAL(GCC_nested_trampoline))
+GLOBAL(GCC_nested_trampoline):
+	.mode	SHmedia
+	ptrel/u	r63, tr0
+	gettr	tr0, r0
+#if __SH5__ == 64
+	ld.q	r0, 24, r1
+#else
+	ld.l	r0, 24, r1
+#endif
+	ptabs/l	r1, tr1
+#if __SH5__ == 64
+	ld.q	r0, 32, r1
+#else
+	ld.l	r0, 28, r1
+#endif
+	blink	tr1, r63
+
+	ENDFUNC(GLOBAL(GCC_nested_trampoline))
+#endif /* L_nested_trampoline */
+#endif /* __SH5__ */
+#if __SH5__ == 32
+#ifdef L_push_pop_shmedia_regs
+	.section	.text..SHmedia32,"ax"
+	.mode	SHmedia
+	.align	2
+#ifndef __SH4_NOFPU__	
+	.global	GLOBAL(GCC_push_shmedia_regs)
+	FUNC(GLOBAL(GCC_push_shmedia_regs))
+GLOBAL(GCC_push_shmedia_regs):
+	addi.l	r15, -14*8, r15
+	fst.d	r15, 13*8, dr62
+	fst.d	r15, 12*8, dr60
+	fst.d	r15, 11*8, dr58
+	fst.d	r15, 10*8, dr56
+	fst.d	r15,  9*8, dr54
+	fst.d	r15,  8*8, dr52
+	fst.d	r15,  7*8, dr50
+	fst.d	r15,  6*8, dr48
+	fst.d	r15,  5*8, dr46
+	fst.d	r15,  4*8, dr44
+	fst.d	r15,  3*8, dr42
+	fst.d	r15,  2*8, dr40
+	fst.d	r15,  1*8, dr38
+	fst.d	r15,  0*8, dr36
+#else /* ! __SH4_NOFPU__ */
+	.global	GLOBAL(GCC_push_shmedia_regs_nofpu)
+	FUNC(GLOBAL(GCC_push_shmedia_regs_nofpu))
+GLOBAL(GCC_push_shmedia_regs_nofpu):
+#endif /* ! __SH4_NOFPU__ */
+	ptabs/l	r18, tr0
+	addi.l	r15, -27*8, r15
+	gettr	tr7, r62
+	gettr	tr6, r61
+	gettr	tr5, r60
+	st.q	r15, 26*8, r62
+	st.q	r15, 25*8, r61
+	st.q	r15, 24*8, r60
+	st.q	r15, 23*8, r59
+	st.q	r15, 22*8, r58
+	st.q	r15, 21*8, r57
+	st.q	r15, 20*8, r56
+	st.q	r15, 19*8, r55
+	st.q	r15, 18*8, r54
+	st.q	r15, 17*8, r53
+	st.q	r15, 16*8, r52
+	st.q	r15, 15*8, r51
+	st.q	r15, 14*8, r50
+	st.q	r15, 13*8, r49
+	st.q	r15, 12*8, r48
+	st.q	r15, 11*8, r47
+	st.q	r15, 10*8, r46
+	st.q	r15,  9*8, r45
+	st.q	r15,  8*8, r44
+	st.q	r15,  7*8, r35
+	st.q	r15,  6*8, r34
+	st.q	r15,  5*8, r33
+	st.q	r15,  4*8, r32
+	st.q	r15,  3*8, r31
+	st.q	r15,  2*8, r30
+	st.q	r15,  1*8, r29
+	st.q	r15,  0*8, r28
+	blink	tr0, r63
+#ifndef __SH4_NOFPU__	
+	ENDFUNC(GLOBAL(GCC_push_shmedia_regs))
+#else
+	ENDFUNC(GLOBAL(GCC_push_shmedia_regs_nofpu))
+#endif
+#ifndef __SH4_NOFPU__	
+	.global	GLOBAL(GCC_pop_shmedia_regs)
+	FUNC(GLOBAL(GCC_pop_shmedia_regs))
+GLOBAL(GCC_pop_shmedia_regs):
+	pt	.L0, tr1
+	movi	41*8, r0
+	fld.d	r15, 40*8, dr62
+	fld.d	r15, 39*8, dr60
+	fld.d	r15, 38*8, dr58
+	fld.d	r15, 37*8, dr56
+	fld.d	r15, 36*8, dr54
+	fld.d	r15, 35*8, dr52
+	fld.d	r15, 34*8, dr50
+	fld.d	r15, 33*8, dr48
+	fld.d	r15, 32*8, dr46
+	fld.d	r15, 31*8, dr44
+	fld.d	r15, 30*8, dr42
+	fld.d	r15, 29*8, dr40
+	fld.d	r15, 28*8, dr38
+	fld.d	r15, 27*8, dr36
+	blink	tr1, r63
+#else /* ! __SH4_NOFPU__	*/
+	.global	GLOBAL(GCC_pop_shmedia_regs_nofpu)
+	FUNC(GLOBAL(GCC_pop_shmedia_regs_nofpu))
+GLOBAL(GCC_pop_shmedia_regs_nofpu):
+#endif /* ! __SH4_NOFPU__	*/
+	movi	27*8, r0
+.L0:
+	ptabs	r18, tr0
+	ld.q	r15, 26*8, r62
+	ld.q	r15, 25*8, r61
+	ld.q	r15, 24*8, r60
+	ptabs	r62, tr7
+	ptabs	r61, tr6
+	ptabs	r60, tr5
+	ld.q	r15, 23*8, r59
+	ld.q	r15, 22*8, r58
+	ld.q	r15, 21*8, r57
+	ld.q	r15, 20*8, r56
+	ld.q	r15, 19*8, r55
+	ld.q	r15, 18*8, r54
+	ld.q	r15, 17*8, r53
+	ld.q	r15, 16*8, r52
+	ld.q	r15, 15*8, r51
+	ld.q	r15, 14*8, r50
+	ld.q	r15, 13*8, r49
+	ld.q	r15, 12*8, r48
+	ld.q	r15, 11*8, r47
+	ld.q	r15, 10*8, r46
+	ld.q	r15,  9*8, r45
+	ld.q	r15,  8*8, r44
+	ld.q	r15,  7*8, r35
+	ld.q	r15,  6*8, r34
+	ld.q	r15,  5*8, r33
+	ld.q	r15,  4*8, r32
+	ld.q	r15,  3*8, r31
+	ld.q	r15,  2*8, r30
+	ld.q	r15,  1*8, r29
+	ld.q	r15,  0*8, r28
+	add.l	r15, r0, r15
+	blink	tr0, r63
+
+#ifndef __SH4_NOFPU__
+	ENDFUNC(GLOBAL(GCC_pop_shmedia_regs))
+#else
+	ENDFUNC(GLOBAL(GCC_pop_shmedia_regs_nofpu))
+#endif
+#endif /* __SH5__ == 32 */
+#endif /* L_push_pop_shmedia_regs */
+
+#ifdef L_div_table
+#if __SH5__
+#if defined(__pic__) && defined(__SHMEDIA__)
+	.global	GLOBAL(sdivsi3)
+	FUNC(GLOBAL(sdivsi3))
+#if __SH5__ == 32
+	.section	.text..SHmedia32,"ax"
+#else
+	.text
+#endif
+#if 0
+/* ??? FIXME: Presumably due to a linker bug, exporting data symbols
+   in a text section does not work (at least for shared libraries):
+   the linker sets the LSB of the address as if this was SHmedia code.  */
+#define TEXT_DATA_BUG
+#endif
+	.align	2
+ // inputs: r4,r5
+ // clobbered: r1,r18,r19,r20,r21,r25,tr0
+ // result in r0
+ .global GLOBAL(sdivsi3)
+GLOBAL(sdivsi3):
+#ifdef TEXT_DATA_BUG
+ ptb datalabel Local_div_table,tr0
+#else
+ ptb GLOBAL(div_table_internal),tr0
+#endif
+ nsb r5, r1
+ shlld r5, r1, r25    // normalize; [-2 ..1, 1..2) in s2.62
+ shari r25, 58, r21   // extract 5(6) bit index (s2.4 with hole -1..1)
+ /* bubble */
+ gettr tr0,r20
+ ldx.ub r20, r21, r19 // u0.8
+ shari r25, 32, r25   // normalize to s2.30
+ shlli r21, 1, r21
+ muls.l r25, r19, r19 // s2.38
+ ldx.w r20, r21, r21  // s2.14
+  ptabs r18, tr0
+ shari r19, 24, r19   // truncate to s2.14
+ sub r21, r19, r19    // some 11 bit inverse in s1.14
+ muls.l r19, r19, r21 // u0.28
+  sub r63, r1, r1
+  addi r1, 92, r1
+ muls.l r25, r21, r18 // s2.58
+ shlli r19, 45, r19   // multiply by two and convert to s2.58
+  /* bubble */
+ sub r19, r18, r18
+ shari r18, 28, r18   // some 22 bit inverse in s1.30
+ muls.l r18, r25, r0  // s2.60
+  muls.l r18, r4, r25 // s32.30
+  /* bubble */
+ shari r0, 16, r19   // s-16.44
+ muls.l r19, r18, r19 // s-16.74
+  shari r25, 63, r0
+  shari r4, 14, r18   // s19.-14
+ shari r19, 30, r19   // s-16.44
+ muls.l r19, r18, r19 // s15.30
+  xor r21, r0, r21    // You could also use the constant 1 << 27.
+  add r21, r25, r21
+ sub r21, r19, r21
+ shard r21, r1, r21
+ sub r21, r0, r0
+ blink tr0, r63
+	ENDFUNC(GLOBAL(sdivsi3))
+/* This table has been generated by divtab.c .
+Defects for bias -330:
+   Max defect: 6.081536e-07 at -1.000000e+00
+   Min defect: 2.849516e-08 at 1.030651e+00
+   Max 2nd step defect: 9.606539e-12 at -1.000000e+00
+   Min 2nd step defect: 0.000000e+00 at 0.000000e+00
+   Defect at 1: 1.238659e-07
+   Defect at -2: 1.061708e-07 */
+#else /* ! __pic__ || ! __SHMEDIA__ */
+	.section	.rodata
+#endif /* __pic__ */
+#if defined(TEXT_DATA_BUG) && defined(__pic__) && defined(__SHMEDIA__)
+	.balign 2
+	.type	Local_div_table,@object
+	.size	Local_div_table,128
+/* negative division constants */
+	.word	-16638
+	.word	-17135
+	.word	-17737
+	.word	-18433
+	.word	-19103
+	.word	-19751
+	.word	-20583
+	.word	-21383
+	.word	-22343
+	.word	-23353
+	.word	-24407
+	.word	-25582
+	.word	-26863
+	.word	-28382
+	.word	-29965
+	.word	-31800
+/* negative division factors */
+	.byte	66
+	.byte	70
+	.byte	75
+	.byte	81
+	.byte	87
+	.byte	93
+	.byte	101
+	.byte	109
+	.byte	119
+	.byte	130
+	.byte	142
+	.byte	156
+	.byte	172
+	.byte	192
+	.byte	214
+	.byte	241
+	.skip 16
+Local_div_table:
+	.skip 16
+/* positive division factors */
+	.byte	241
+	.byte	214
+	.byte	192
+	.byte	172
+	.byte	156
+	.byte	142
+	.byte	130
+	.byte	119
+	.byte	109
+	.byte	101
+	.byte	93
+	.byte	87
+	.byte	81
+	.byte	75
+	.byte	70
+	.byte	66
+/* positive division constants */
+	.word	31801
+	.word	29966
+	.word	28383
+	.word	26864
+	.word	25583
+	.word	24408
+	.word	23354
+	.word	22344
+	.word	21384
+	.word	20584
+	.word	19752
+	.word	19104
+	.word	18434
+	.word	17738
+	.word	17136
+	.word	16639
+	.section	.rodata
+#endif /* TEXT_DATA_BUG */
+	.balign 2
+	.type	GLOBAL(div_table),@object
+	.size	GLOBAL(div_table),128
+/* negative division constants */
+	.word	-16638
+	.word	-17135
+	.word	-17737
+	.word	-18433
+	.word	-19103
+	.word	-19751
+	.word	-20583
+	.word	-21383
+	.word	-22343
+	.word	-23353
+	.word	-24407
+	.word	-25582
+	.word	-26863
+	.word	-28382
+	.word	-29965
+	.word	-31800
+/* negative division factors */
+	.byte	66
+	.byte	70
+	.byte	75
+	.byte	81
+	.byte	87
+	.byte	93
+	.byte	101
+	.byte	109
+	.byte	119
+	.byte	130
+	.byte	142
+	.byte	156
+	.byte	172
+	.byte	192
+	.byte	214
+	.byte	241
+	.skip 16
+	.global	GLOBAL(div_table)
+GLOBAL(div_table):
+	HIDDEN_ALIAS(div_table_internal,div_table)
+	.skip 16
+/* positive division factors */
+	.byte	241
+	.byte	214
+	.byte	192
+	.byte	172
+	.byte	156
+	.byte	142
+	.byte	130
+	.byte	119
+	.byte	109
+	.byte	101
+	.byte	93
+	.byte	87
+	.byte	81
+	.byte	75
+	.byte	70
+	.byte	66
+/* positive division constants */
+	.word	31801
+	.word	29966
+	.word	28383
+	.word	26864
+	.word	25583
+	.word	24408
+	.word	23354
+	.word	22344
+	.word	21384
+	.word	20584
+	.word	19752
+	.word	19104
+	.word	18434
+	.word	17738
+	.word	17136
+	.word	16639
+
+#elif defined (__SH3__) || defined (__SH3E__) || defined (__SH4__) || defined (__SH4_SINGLE__) || defined (__SH4_SINGLE_ONLY__) || defined (__SH4_NOFPU__)
+/* This code used shld, thus is not suitable for SH1 / SH2.  */
+
+/* Signed / unsigned division without use of FPU, optimized for SH4.
+   Uses a lookup table for divisors in the range -128 .. +128, and
+   div1 with case distinction for larger divisors in three more ranges.
+   The code is lumped together with the table to allow the use of mova.  */
+#ifdef __LITTLE_ENDIAN__
+#define L_LSB 0
+#define L_LSWMSB 1
+#define L_MSWLSB 2
+#else
+#define L_LSB 3
+#define L_LSWMSB 2
+#define L_MSWLSB 1
+#endif
+
+	.balign 4
+	.global	GLOBAL(udivsi3_i4i)
+	FUNC(GLOBAL(udivsi3_i4i))
+GLOBAL(udivsi3_i4i):
+	mov.w LOCAL(c128_w), r1
+	div0u
+	mov r4,r0
+	shlr8 r0
+	cmp/hi r1,r5
+	extu.w r5,r1
+	bf LOCAL(udiv_le128)
+	cmp/eq r5,r1
+	bf LOCAL(udiv_ge64k)
+	shlr r0
+	mov r5,r1
+	shll16 r5
+	mov.l r4,@-r15
+	div1 r5,r0
+	mov.l r1,@-r15
+	div1 r5,r0
+	div1 r5,r0
+	bra LOCAL(udiv_25)
+	div1 r5,r0
+
+LOCAL(div_le128):
+	mova LOCAL(div_table_ix),r0
+	bra LOCAL(div_le128_2)
+	mov.b @(r0,r5),r1
+LOCAL(udiv_le128):
+	mov.l r4,@-r15
+	mova LOCAL(div_table_ix),r0
+	mov.b @(r0,r5),r1
+	mov.l r5,@-r15
+LOCAL(div_le128_2):
+	mova LOCAL(div_table_inv),r0
+	mov.l @(r0,r1),r1
+	mov r5,r0
+	tst #0xfe,r0
+	mova LOCAL(div_table_clz),r0
+	dmulu.l r1,r4
+	mov.b @(r0,r5),r1
+	bt/s LOCAL(div_by_1)
+	mov r4,r0
+	mov.l @r15+,r5
+	sts mach,r0
+	/* clrt */
+	addc r4,r0
+	mov.l @r15+,r4
+	rotcr r0
+	rts
+	shld r1,r0
+
+LOCAL(div_by_1_neg):
+	neg r4,r0
+LOCAL(div_by_1):
+	mov.l @r15+,r5
+	rts
+	mov.l @r15+,r4
+
+LOCAL(div_ge64k):
+	bt/s LOCAL(div_r8)
+	div0u
+	shll8 r5
+	bra LOCAL(div_ge64k_2)
+	div1 r5,r0
+LOCAL(udiv_ge64k):
+	cmp/hi r0,r5
+	mov r5,r1
+	bt LOCAL(udiv_r8)
+	shll8 r5
+	mov.l r4,@-r15
+	div1 r5,r0
+	mov.l r1,@-r15
+LOCAL(div_ge64k_2):
+	div1 r5,r0
+	mov.l LOCAL(zero_l),r1
+	.rept 4
+	div1 r5,r0
+	.endr
+	mov.l r1,@-r15
+	div1 r5,r0
+	mov.w LOCAL(m256_w),r1
+	div1 r5,r0
+	mov.b r0,@(L_LSWMSB,r15)
+	xor r4,r0
+	and r1,r0
+	bra LOCAL(div_ge64k_end)
+	xor r4,r0
+	
+LOCAL(div_r8):
+	shll16 r4
+	bra LOCAL(div_r8_2)
+	shll8 r4
+LOCAL(udiv_r8):
+	mov.l r4,@-r15
+	shll16 r4
+	clrt
+	shll8 r4
+	mov.l r5,@-r15
+LOCAL(div_r8_2):
+	rotcl r4
+	mov r0,r1
+	div1 r5,r1
+	mov r4,r0
+	rotcl r0
+	mov r5,r4
+	div1 r5,r1
+	.rept 5
+	rotcl r0; div1 r5,r1
+	.endr
+	rotcl r0
+	mov.l @r15+,r5
+	div1 r4,r1
+	mov.l @r15+,r4
+	rts
+	rotcl r0
+
+	ENDFUNC(GLOBAL(udivsi3_i4i))
+
+	.global	GLOBAL(sdivsi3_i4i)
+	FUNC(GLOBAL(sdivsi3_i4i))
+	/* This is link-compatible with a GLOBAL(sdivsi3) call,
+	   but we effectively clobber only r1.  */
+GLOBAL(sdivsi3_i4i):
+	mov.l r4,@-r15
+	cmp/pz r5
+	mov.w LOCAL(c128_w), r1
+	bt/s LOCAL(pos_divisor)
+	cmp/pz r4
+	mov.l r5,@-r15
+	neg r5,r5
+	bt/s LOCAL(neg_result)
+	cmp/hi r1,r5
+	neg r4,r4
+LOCAL(pos_result):
+	extu.w r5,r0
+	bf LOCAL(div_le128)
+	cmp/eq r5,r0
+	mov r4,r0
+	shlr8 r0
+	bf/s LOCAL(div_ge64k)
+	cmp/hi r0,r5
+	div0u
+	shll16 r5
+	div1 r5,r0
+	div1 r5,r0
+	div1 r5,r0
+LOCAL(udiv_25):
+	mov.l LOCAL(zero_l),r1
+	div1 r5,r0
+	div1 r5,r0
+	mov.l r1,@-r15
+	.rept 3
+	div1 r5,r0
+	.endr
+	mov.b r0,@(L_MSWLSB,r15)
+	xtrct r4,r0
+	swap.w r0,r0
+	.rept 8
+	div1 r5,r0
+	.endr
+	mov.b r0,@(L_LSWMSB,r15)
+LOCAL(div_ge64k_end):
+	.rept 8
+	div1 r5,r0
+	.endr
+	mov.l @r15+,r4 ! zero-extension and swap using LS unit.
+	extu.b r0,r0
+	mov.l @r15+,r5
+	or r4,r0
+	mov.l @r15+,r4
+	rts
+	rotcl r0
+
+LOCAL(div_le128_neg):
+	tst #0xfe,r0
+	mova LOCAL(div_table_ix),r0
+	mov.b @(r0,r5),r1
+	mova LOCAL(div_table_inv),r0
+	bt/s LOCAL(div_by_1_neg)
+	mov.l @(r0,r1),r1
+	mova LOCAL(div_table_clz),r0
+	dmulu.l r1,r4
+	mov.b @(r0,r5),r1
+	mov.l @r15+,r5
+	sts mach,r0
+	/* clrt */
+	addc r4,r0
+	mov.l @r15+,r4
+	rotcr r0
+	shld r1,r0
+	rts
+	neg r0,r0
+
+LOCAL(pos_divisor):
+	mov.l r5,@-r15
+	bt/s LOCAL(pos_result)
+	cmp/hi r1,r5
+	neg r4,r4
+LOCAL(neg_result):
+	extu.w r5,r0
+	bf LOCAL(div_le128_neg)
+	cmp/eq r5,r0
+	mov r4,r0
+	shlr8 r0
+	bf/s LOCAL(div_ge64k_neg)
+	cmp/hi r0,r5
+	div0u
+	mov.l LOCAL(zero_l),r1
+	shll16 r5
+	div1 r5,r0
+	mov.l r1,@-r15
+	.rept 7
+	div1 r5,r0
+	.endr
+	mov.b r0,@(L_MSWLSB,r15)
+	xtrct r4,r0
+	swap.w r0,r0
+	.rept 8
+	div1 r5,r0
+	.endr
+	mov.b r0,@(L_LSWMSB,r15)
+LOCAL(div_ge64k_neg_end):
+	.rept 8
+	div1 r5,r0
+	.endr
+	mov.l @r15+,r4 ! zero-extension and swap using LS unit.
+	extu.b r0,r1
+	mov.l @r15+,r5
+	or r4,r1
+LOCAL(div_r8_neg_end):
+	mov.l @r15+,r4
+	rotcl r1
+	rts
+	neg r1,r0
+
+LOCAL(div_ge64k_neg):
+	bt/s LOCAL(div_r8_neg)
+	div0u
+	shll8 r5
+	mov.l LOCAL(zero_l),r1
+	.rept 6
+	div1 r5,r0
+	.endr
+	mov.l r1,@-r15
+	div1 r5,r0
+	mov.w LOCAL(m256_w),r1
+	div1 r5,r0
+	mov.b r0,@(L_LSWMSB,r15)
+	xor r4,r0
+	and r1,r0
+	bra LOCAL(div_ge64k_neg_end)
+	xor r4,r0
+
+LOCAL(c128_w):
+	.word 128
+
+LOCAL(div_r8_neg):
+	clrt
+	shll16 r4
+	mov r4,r1
+	shll8 r1
+	mov r5,r4
+	.rept 7
+	rotcl r1; div1 r5,r0
+	.endr
+	mov.l @r15+,r5
+	rotcl r1
+	bra LOCAL(div_r8_neg_end)
+	div1 r4,r0
+
+LOCAL(m256_w):
+	.word 0xff00
+/* This table has been generated by divtab-sh4.c.  */
+	.balign 4
+LOCAL(div_table_clz):
+	.byte	0
+	.byte	1
+	.byte	0
+	.byte	-1
+	.byte	-1
+	.byte	-2
+	.byte	-2
+	.byte	-2
+	.byte	-2
+	.byte	-3
+	.byte	-3
+	.byte	-3
+	.byte	-3
+	.byte	-3
+	.byte	-3
+	.byte	-3
+	.byte	-3
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-4
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-5
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+	.byte	-6
+/* Lookup table translating positive divisor to index into table of
+   normalized inverse.  N.B. the '0' entry is also the last entry of the
+ previous table, and causes an unaligned access for division by zero.  */
+LOCAL(div_table_ix):
+	.byte	-6
+	.byte	-128
+	.byte	-128
+	.byte	0
+	.byte	-128
+	.byte	-64
+	.byte	0
+	.byte	64
+	.byte	-128
+	.byte	-96
+	.byte	-64
+	.byte	-32
+	.byte	0
+	.byte	32
+	.byte	64
+	.byte	96
+	.byte	-128
+	.byte	-112
+	.byte	-96
+	.byte	-80
+	.byte	-64
+	.byte	-48
+	.byte	-32
+	.byte	-16
+	.byte	0
+	.byte	16
+	.byte	32
+	.byte	48
+	.byte	64
+	.byte	80
+	.byte	96
+	.byte	112
+	.byte	-128
+	.byte	-120
+	.byte	-112
+	.byte	-104
+	.byte	-96
+	.byte	-88
+	.byte	-80
+	.byte	-72
+	.byte	-64
+	.byte	-56
+	.byte	-48
+	.byte	-40
+	.byte	-32
+	.byte	-24
+	.byte	-16
+	.byte	-8
+	.byte	0
+	.byte	8
+	.byte	16
+	.byte	24
+	.byte	32
+	.byte	40
+	.byte	48
+	.byte	56
+	.byte	64
+	.byte	72
+	.byte	80
+	.byte	88
+	.byte	96
+	.byte	104
+	.byte	112
+	.byte	120
+	.byte	-128
+	.byte	-124
+	.byte	-120
+	.byte	-116
+	.byte	-112
+	.byte	-108
+	.byte	-104
+	.byte	-100
+	.byte	-96
+	.byte	-92
+	.byte	-88
+	.byte	-84
+	.byte	-80
+	.byte	-76
+	.byte	-72
+	.byte	-68
+	.byte	-64
+	.byte	-60
+	.byte	-56
+	.byte	-52
+	.byte	-48
+	.byte	-44
+	.byte	-40
+	.byte	-36
+	.byte	-32
+	.byte	-28
+	.byte	-24
+	.byte	-20
+	.byte	-16
+	.byte	-12
+	.byte	-8
+	.byte	-4
+	.byte	0
+	.byte	4
+	.byte	8
+	.byte	12
+	.byte	16
+	.byte	20
+	.byte	24
+	.byte	28
+	.byte	32
+	.byte	36
+	.byte	40
+	.byte	44
+	.byte	48
+	.byte	52
+	.byte	56
+	.byte	60
+	.byte	64
+	.byte	68
+	.byte	72
+	.byte	76
+	.byte	80
+	.byte	84
+	.byte	88
+	.byte	92
+	.byte	96
+	.byte	100
+	.byte	104
+	.byte	108
+	.byte	112
+	.byte	116
+	.byte	120
+	.byte	124
+	.byte	-128
+/* 1/64 .. 1/127, normalized.  There is an implicit leading 1 in bit 32.  */
+	.balign 4
+LOCAL(zero_l):
+	.long	0x0
+	.long	0xF81F81F9
+	.long	0xF07C1F08
+	.long	0xE9131AC0
+	.long	0xE1E1E1E2
+	.long	0xDAE6076C
+	.long	0xD41D41D5
+	.long	0xCD856891
+	.long	0xC71C71C8
+	.long	0xC0E07039
+	.long	0xBACF914D
+	.long	0xB4E81B4F
+	.long	0xAF286BCB
+	.long	0xA98EF607
+	.long	0xA41A41A5
+	.long	0x9EC8E952
+	.long	0x9999999A
+	.long	0x948B0FCE
+	.long	0x8F9C18FA
+	.long	0x8ACB90F7
+	.long	0x86186187
+	.long	0x81818182
+	.long	0x7D05F418
+	.long	0x78A4C818
+	.long	0x745D1746
+	.long	0x702E05C1
+	.long	0x6C16C16D
+	.long	0x68168169
+	.long	0x642C8591
+	.long	0x60581606
+	.long	0x5C9882BA
+	.long	0x58ED2309
+LOCAL(div_table_inv):
+	.long	0x55555556
+	.long	0x51D07EAF
+	.long	0x4E5E0A73
+	.long	0x4AFD6A06
+	.long	0x47AE147B
+	.long	0x446F8657
+	.long	0x41414142
+	.long	0x3E22CBCF
+	.long	0x3B13B13C
+	.long	0x38138139
+	.long	0x3521CFB3
+	.long	0x323E34A3
+	.long	0x2F684BDB
+	.long	0x2C9FB4D9
+	.long	0x29E4129F
+	.long	0x27350B89
+	.long	0x24924925
+	.long	0x21FB7813
+	.long	0x1F7047DD
+	.long	0x1CF06ADB
+	.long	0x1A7B9612
+	.long	0x18118119
+	.long	0x15B1E5F8
+	.long	0x135C8114
+	.long	0x11111112
+	.long	0xECF56BF
+	.long	0xC9714FC
+	.long	0xA6810A7
+	.long	0x8421085
+	.long	0x624DD30
+	.long	0x4104105
+	.long	0x2040811
+	/* maximum error: 0.987342 scaled: 0.921875*/
+
+	ENDFUNC(GLOBAL(sdivsi3_i4i))
+#endif /* SH3 / SH4 */
+
+#endif /* L_div_table */
+
+#ifdef L_udiv_qrnnd_16
+#if !__SHMEDIA__
+	HIDDEN_FUNC(GLOBAL(udiv_qrnnd_16))
+	/* r0: rn r1: qn */ /* r0: n1 r4: n0 r5: d r6: d1 */ /* r2: __m */
+	/* n1 < d, but n1 might be larger than d1.  */
+	.global GLOBAL(udiv_qrnnd_16)
+	.balign 8
+GLOBAL(udiv_qrnnd_16):
+	div0u
+	cmp/hi r6,r0
+	bt .Lots
+	.rept 16
+	div1 r6,r0 
+	.endr
+	extu.w r0,r1
+	bt 0f
+	add r6,r0
+0:	rotcl r1
+	mulu.w r1,r5
+	xtrct r4,r0
+	swap.w r0,r0
+	sts macl,r2
+	cmp/hs r2,r0
+	sub r2,r0
+	bt 0f
+	addc r5,r0
+	add #-1,r1
+	bt 0f
+1:	add #-1,r1
+	rts
+	add r5,r0
+	.balign 8
+.Lots:
+	sub r5,r0
+	swap.w r4,r1
+	xtrct r0,r1
+	clrt
+	mov r1,r0
+	addc r5,r0
+	mov #-1,r1
+	SL1(bf, 1b,
+	shlr16 r1)
+0:	rts
+	nop
+	ENDFUNC(GLOBAL(udiv_qrnnd_16))
+#endif /* !__SHMEDIA__ */
+#endif /* L_udiv_qrnnd_16 */
diff --git a/libgcc/config/sh/lib1funcs.h b/libgcc/config/sh/lib1funcs.h
new file mode 100644
index 00000000000..af4b41cc314
--- /dev/null
+++ b/libgcc/config/sh/lib1funcs.h
@@ -0,0 +1,76 @@
+/* Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
+   2004, 2005, 2006, 2009
+   Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#ifdef __ELF__
+#define LOCAL(X)	.L_##X
+#define FUNC(X)		.type X,@function
+#define HIDDEN_FUNC(X)	FUNC(X); .hidden X
+#define HIDDEN_ALIAS(X,Y) ALIAS (X,Y); .hidden GLOBAL(X)
+#define ENDFUNC0(X)	.Lfe_##X: .size X,.Lfe_##X-X
+#define ENDFUNC(X)	ENDFUNC0(X)
+#else
+#define LOCAL(X)	L_##X
+#define FUNC(X)
+#define HIDDEN_FUNC(X)
+#define HIDDEN_ALIAS(X,Y) ALIAS (X,Y)
+#define ENDFUNC(X)
+#endif
+
+#define	CONCAT(A,B)	A##B
+#define	GLOBAL0(U,X)	CONCAT(U,__##X)
+#define	GLOBAL(X)	GLOBAL0(__USER_LABEL_PREFIX__,X)
+
+#define ALIAS(X,Y)	.global GLOBAL(X); .set GLOBAL(X),GLOBAL(Y)
+
+#if defined __SH2A__ && defined __FMOVD_ENABLED__
+#undef  FMOVD_WORKS
+#define FMOVD_WORKS
+#endif
+
+#ifdef __LITTLE_ENDIAN__
+#define DR00 fr1
+#define DR01 fr0
+#define DR20 fr3
+#define DR21 fr2
+#define DR40 fr5
+#define DR41 fr4
+#else /* !__LITTLE_ENDIAN__ */
+#define DR00 fr0
+#define DR01 fr1
+#define DR20 fr2
+#define DR21 fr3
+#define DR40 fr4
+#define DR41 fr5
+#endif /* !__LITTLE_ENDIAN__ */
+
+#ifdef __sh1__
+#define SL(branch, dest, in_slot, in_slot_arg2) \
+	in_slot, in_slot_arg2; branch dest
+#define SL1(branch, dest, in_slot) \
+	in_slot; branch dest
+#else /* ! __sh1__ */
+#define SL(branch, dest, in_slot, in_slot_arg2) \
+	branch##.s dest; in_slot, in_slot_arg2
+#define SL1(branch, dest, in_slot) \
+	branch##/s dest; in_slot
+#endif /* !__sh1__ */
diff --git a/libgcc/config/sh/t-linux b/libgcc/config/sh/t-linux
index af618e260c6..9b1feacd1f3 100644
--- a/libgcc/config/sh/t-linux
+++ b/libgcc/config/sh/t-linux
@@ -1,3 +1,5 @@
+LIB1ASMFUNCS_CACHE = _ic_invalidate _ic_invalidate_array
+
 HOST_LIBGCC2_CFLAGS = -fpic -mieee -DNO_FPSCR_VALUES
 
 # Override t-slibgcc-elf-ver to export some libgcc symbols with
diff --git a/libgcc/config/sh/t-netbsd b/libgcc/config/sh/t-netbsd
new file mode 100644
index 00000000000..663edbf4187
--- /dev/null
+++ b/libgcc/config/sh/t-netbsd
@@ -0,0 +1 @@
+LIB1ASMFUNCS_CACHE = _ic_invalidate
diff --git a/libgcc/config/sh/t-sh b/libgcc/config/sh/t-sh
index ab4d98089b1..2319adbef1d 100644
--- a/libgcc/config/sh/t-sh
+++ b/libgcc/config/sh/t-sh
@@ -17,26 +17,33 @@
 # along with GCC; see the file COPYING3.  If not see
 # <http://www.gnu.org/licenses/>.
 
+LIB1ASMSRC = sh/lib1funcs.S
+LIB1ASMFUNCS = _ashiftrt _ashiftrt_n _ashiftlt _lshiftrt _movmem \
+  _movmem_i4 _mulsi3 _sdivsi3 _sdivsi3_i4 _udivsi3 _udivsi3_i4 _set_fpscr \
+  _div_table _udiv_qrnnd_16 \
+  $(LIB1ASMFUNCS_CACHE)
+LIB1ASMFUNCS_CACHE = _ic_invalidate _ic_invalidate_array
+
 crt1.o: $(srcdir)/config/sh/crt1.S
 	$(gcc_compile) -c $<
 
-ic_invalidate_array_4-100.o: $(gcc_srcdir)/config/sh/lib1funcs.asm
+ic_invalidate_array_4-100.o: $(srcdir)/config/sh/lib1funcs.S
 	$(gcc_compile) -c -DL_ic_invalidate_array -DWAYS=1 -DWAY_SIZE=0x2000 $<
 libic_invalidate_array_4-100.a: ic_invalidate_array_4-100.o
 	$(AR_CREATE_FOR_TARGET) $@ $<
 
-ic_invalidate_array_4-200.o: $(gcc_srcdir)/config/sh/lib1funcs.asm
+ic_invalidate_array_4-200.o: $(srcdir)/config/sh/lib1funcs.S
 	$(gcc_compile) -c -DL_ic_invalidate_array -DWAYS=2 -DWAY_SIZE=0x2000 $<
 libic_invalidate_array_4-200.a: ic_invalidate_array_4-200.o
 	$(AR_CREATE_FOR_TARGET) $@ $<
 
-ic_invalidate_array_4a.o: $(gcc_srcdir)/config/sh/lib1funcs.asm
+ic_invalidate_array_4a.o: $(srcdir)/config/sh/lib1funcs.S
 	$(gcc_compile) -c -DL_ic_invalidate_array -D__FORCE_SH4A__ $<
 libic_invalidate_array_4a.a: ic_invalidate_array_4a.o
 	$(AR_CREATE_FOR_TARGET) $@ $<
 
 sdivsi3_i4i-Os-4-200.o: $(srcdir)/config/sh/lib1funcs-Os-4-200.S
-	$(gcc_compile) -c -DL_sdivsi3_i4i $<
+	$(compile) -c -DL_sdivsi3_i4i $<
 udivsi3_i4i-Os-4-200.o: $(srcdir)/config/sh/lib1funcs-Os-4-200.S
 	$(gcc_compile) -c -DL_udivsi3_i4i $<
 unwind-dw2-Os-4-200.o: $(gcc_srcdir)/unwind-dw2.c
diff --git a/libgcc/config/sh/t-sh64 b/libgcc/config/sh/t-sh64
new file mode 100644
index 00000000000..fa9950e03b2
--- /dev/null
+++ b/libgcc/config/sh/t-sh64
@@ -0,0 +1,6 @@
+LIB1ASMFUNCS = \
+  _sdivsi3 _sdivsi3_i4 _udivsi3 _udivsi3_i4 _set_fpscr \
+  _shcompact_call_trampoline _shcompact_return_trampoline \
+  _shcompact_incoming_args _ic_invalidate _nested_trampoline \
+  _push_pop_shmedia_regs \
+  _udivdi3 _divdi3 _umoddi3 _moddi3 _div_table
diff --git a/libgcc/config/sparc/lb1spc.S b/libgcc/config/sparc/lb1spc.S
new file mode 100644
index 00000000000..b60bd5740e7
--- /dev/null
+++ b/libgcc/config/sparc/lb1spc.S
@@ -0,0 +1,784 @@
+/* This is an assembly language implementation of mulsi3, divsi3, and modsi3
+   for the sparc processor.
+
+   These routines are derived from the SPARC Architecture Manual, version 8,
+   slightly edited to match the desired calling convention, and also to
+   optimize them for our purposes.  */
+
+#ifdef L_mulsi3
+.text
+	.align 4
+	.global .umul
+	.proc 4
+.umul:
+	or	%o0, %o1, %o4	! logical or of multiplier and multiplicand
+	mov	%o0, %y		! multiplier to Y register
+	andncc	%o4, 0xfff, %o5	! mask out lower 12 bits
+	be	mul_shortway	! can do it the short way
+	andcc	%g0, %g0, %o4	! zero the partial product and clear NV cc
+	!
+	! long multiply
+	!
+	mulscc	%o4, %o1, %o4	! first iteration of 33
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4	! 32nd iteration
+	mulscc	%o4, %g0, %o4	! last iteration only shifts
+	! the upper 32 bits of product are wrong, but we do not care
+	retl
+	rd	%y, %o0
+	!
+	! short multiply
+	!
+mul_shortway:
+	mulscc	%o4, %o1, %o4	! first iteration of 13
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4
+	mulscc	%o4, %o1, %o4	! 12th iteration
+	mulscc	%o4, %g0, %o4	! last iteration only shifts
+	rd	%y, %o5
+	sll	%o4, 12, %o4	! left shift partial product by 12 bits
+	srl	%o5, 20, %o5	! right shift partial product by 20 bits
+	retl
+	or	%o5, %o4, %o0	! merge for true product
+#endif
+
+#ifdef L_divsi3
+/*
+ * Division and remainder, from Appendix E of the SPARC Version 8
+ * Architecture Manual, with fixes from Gordon Irlam.
+ */
+
+/*
+ * Input: dividend and divisor in %o0 and %o1 respectively.
+ *
+ * m4 parameters:
+ *  .div	name of function to generate
+ *  div		div=div => %o0 / %o1; div=rem => %o0 % %o1
+ *  true		true=true => signed; true=false => unsigned
+ *
+ * Algorithm parameters:
+ *  N		how many bits per iteration we try to get (4)
+ *  WORDSIZE	total number of bits (32)
+ *
+ * Derived constants:
+ *  TOPBITS	number of bits in the top decade of a number
+ *
+ * Important variables:
+ *  Q		the partial quotient under development (initially 0)
+ *  R		the remainder so far, initially the dividend
+ *  ITER	number of main division loop iterations required;
+ *		equal to ceil(log2(quotient) / N).  Note that this
+ *		is the log base (2^N) of the quotient.
+ *  V		the current comparand, initially divisor*2^(ITER*N-1)
+ *
+ * Cost:
+ *  Current estimate for non-large dividend is
+ *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
+ *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
+ *  different path, as the upper bits of the quotient must be developed
+ *  one bit at a time.
+ */
+        .global .udiv
+        .align 4
+        .proc 4
+        .text
+.udiv:
+         b ready_to_divide
+         mov 0, %g3             ! result is always positive
+
+        .global .div
+        .align 4
+        .proc 4
+        .text
+.div:
+	! compute sign of result; if neither is negative, no problem
+	orcc	%o1, %o0, %g0	! either negative?
+	bge	ready_to_divide	! no, go do the divide
+	xor	%o1, %o0, %g3	! compute sign in any case
+	tst	%o1
+	bge	1f
+	tst	%o0
+	! %o1 is definitely negative; %o0 might also be negative
+	bge	ready_to_divide	! if %o0 not negative...
+	sub	%g0, %o1, %o1	! in any case, make %o1 nonneg
+1:	! %o0 is negative, %o1 is nonnegative
+	sub	%g0, %o0, %o0	! make %o0 nonnegative
+
+
+ready_to_divide:
+
+	! Ready to divide.  Compute size of quotient; scale comparand.
+	orcc	%o1, %g0, %o5
+	bne	1f
+	mov	%o0, %o3
+
+	! Divide by zero trap.  If it returns, return 0 (about as
+	! wrong as possible, but that is what SunOS does...).
+	ta	0x2    		! ST_DIV0
+	retl
+	clr	%o0
+
+1:
+	cmp	%o3, %o5		! if %o1 exceeds %o0, done
+	blu	got_result		! (and algorithm fails otherwise)
+	clr	%o2
+	sethi	%hi(1 << (32 - 4 - 1)), %g1
+	cmp	%o3, %g1
+	blu	not_really_big
+	clr	%o4
+
+	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
+	! as our usual N-at-a-shot divide step will cause overflow and havoc.
+	! The number of bits in the result here is N*ITER+SC, where SC <= N.
+	! Compute ITER in an unorthodox manner: know we need to shift V into
+	! the top decade: so do not even bother to compare to R.
+	1:
+		cmp	%o5, %g1
+		bgeu	3f
+		mov	1, %g2
+		sll	%o5, 4, %o5
+		b	1b
+		add	%o4, 1, %o4
+
+	! Now compute %g2.
+	2:	addcc	%o5, %o5, %o5
+		bcc	not_too_big
+		add	%g2, 1, %g2
+
+		! We get here if the %o1 overflowed while shifting.
+		! This means that %o3 has the high-order bit set.
+		! Restore %o5 and subtract from %o3.
+		sll	%g1, 4, %g1	! high order bit
+		srl	%o5, 1, %o5	! rest of %o5
+		add	%o5, %g1, %o5
+		b	do_single_div
+		sub	%g2, 1, %g2
+
+	not_too_big:
+	3:	cmp	%o5, %o3
+		blu	2b
+		nop
+		be	do_single_div
+		nop
+	/* NB: these are commented out in the V8-SPARC manual as well */
+	/* (I do not understand this) */
+	! %o5 > %o3: went too far: back up 1 step
+	!	srl	%o5, 1, %o5
+	!	dec	%g2
+	! do single-bit divide steps
+	!
+	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
+	! first divide step without thinking.  BUT, the others are conditional,
+	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
+	! order bit set in the first step, just falling into the regular
+	! division loop will mess up the first time around.
+	! So we unroll slightly...
+	do_single_div:
+		subcc	%g2, 1, %g2
+		bl	end_regular_divide
+		nop
+		sub	%o3, %o5, %o3
+		mov	1, %o2
+		b	end_single_divloop
+		nop
+	single_divloop:
+		sll	%o2, 1, %o2
+		bl	1f
+		srl	%o5, 1, %o5
+		! %o3 >= 0
+		sub	%o3, %o5, %o3
+		b	2f
+		add	%o2, 1, %o2
+	1:	! %o3 < 0
+		add	%o3, %o5, %o3
+		sub	%o2, 1, %o2
+	2:
+	end_single_divloop:
+		subcc	%g2, 1, %g2
+		bge	single_divloop
+		tst	%o3
+		b,a	end_regular_divide
+
+not_really_big:
+1:
+	sll	%o5, 4, %o5
+	cmp	%o5, %o3
+	bleu	1b
+	addcc	%o4, 1, %o4
+	be	got_result
+	sub	%o4, 1, %o4
+
+	tst	%o3	! set up for initial iteration
+divloop:
+	sll	%o2, 4, %o2
+	! depth 1, accumulated bits 0
+	bl	L1.16
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	! depth 2, accumulated bits 1
+	bl	L2.17
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	! depth 3, accumulated bits 3
+	bl	L3.19
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	! depth 4, accumulated bits 7
+	bl	L4.23
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (7*2+1), %o2
+	
+L4.23:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (7*2-1), %o2
+	
+	
+L3.19:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	! depth 4, accumulated bits 5
+	bl	L4.21
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (5*2+1), %o2
+	
+L4.21:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (5*2-1), %o2
+	
+L2.17:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	! depth 3, accumulated bits 1
+	bl	L3.17
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	! depth 4, accumulated bits 3
+	bl	L4.19
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (3*2+1), %o2
+	
+L4.19:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (3*2-1), %o2
+
+L3.17:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	! depth 4, accumulated bits 1
+	bl	L4.17
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (1*2+1), %o2
+
+L4.17:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (1*2-1), %o2
+	
+L1.16:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	! depth 2, accumulated bits -1
+	bl	L2.15
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	! depth 3, accumulated bits -1
+	bl	L3.15
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	! depth 4, accumulated bits -1
+	bl	L4.15
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (-1*2+1), %o2
+	
+L4.15:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (-1*2-1), %o2
+	
+L3.15:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	! depth 4, accumulated bits -3
+	bl	L4.13
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (-3*2+1), %o2
+	
+L4.13:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (-3*2-1), %o2
+	
+L2.15:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	! depth 3, accumulated bits -3
+	bl	L3.13
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	! depth 4, accumulated bits -5
+	bl	L4.11
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (-5*2+1), %o2
+	
+L4.11:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (-5*2-1), %o2
+	
+L3.13:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	! depth 4, accumulated bits -7
+	bl	L4.9
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (-7*2+1), %o2
+
+L4.9:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (-7*2-1), %o2
+	
+	9:
+end_regular_divide:
+	subcc	%o4, 1, %o4
+	bge	divloop
+	tst	%o3
+	bl,a	got_result
+	! non-restoring fixup here (one instruction only!)
+	sub	%o2, 1, %o2
+
+
+got_result:
+	! check to see if answer should be < 0
+	tst	%g3
+	bl,a	1f
+	sub %g0, %o2, %o2
+1:
+	retl
+	mov %o2, %o0
+#endif
+
+#ifdef L_modsi3
+/* This implementation was taken from glibc:
+ *
+ * Input: dividend and divisor in %o0 and %o1 respectively.
+ *
+ * Algorithm parameters:
+ *  N		how many bits per iteration we try to get (4)
+ *  WORDSIZE	total number of bits (32)
+ *
+ * Derived constants:
+ *  TOPBITS	number of bits in the top decade of a number
+ *
+ * Important variables:
+ *  Q		the partial quotient under development (initially 0)
+ *  R		the remainder so far, initially the dividend
+ *  ITER	number of main division loop iterations required;
+ *		equal to ceil(log2(quotient) / N).  Note that this
+ *		is the log base (2^N) of the quotient.
+ *  V		the current comparand, initially divisor*2^(ITER*N-1)
+ *
+ * Cost:
+ *  Current estimate for non-large dividend is
+ *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
+ *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
+ *  different path, as the upper bits of the quotient must be developed
+ *  one bit at a time.
+ */
+.text
+	.align 4
+	.global	.urem
+	.proc 4
+.urem:
+	b	divide
+	mov	0, %g3		! result always positive
+
+        .align 4
+	.global .rem
+	.proc 4
+.rem:
+	! compute sign of result; if neither is negative, no problem
+	orcc	%o1, %o0, %g0	! either negative?
+	bge	2f			! no, go do the divide
+	mov	%o0, %g3		! sign of remainder matches %o0
+	tst	%o1
+	bge	1f
+	tst	%o0
+	! %o1 is definitely negative; %o0 might also be negative
+	bge	2f			! if %o0 not negative...
+	sub	%g0, %o1, %o1	! in any case, make %o1 nonneg
+1:	! %o0 is negative, %o1 is nonnegative
+	sub	%g0, %o0, %o0	! make %o0 nonnegative
+2:
+
+	! Ready to divide.  Compute size of quotient; scale comparand.
+divide:
+	orcc	%o1, %g0, %o5
+	bne	1f
+	mov	%o0, %o3
+
+		! Divide by zero trap.  If it returns, return 0 (about as
+		! wrong as possible, but that is what SunOS does...).
+		ta	0x2   !ST_DIV0
+		retl
+		clr	%o0
+
+1:
+	cmp	%o3, %o5		! if %o1 exceeds %o0, done
+	blu	got_result		! (and algorithm fails otherwise)
+	clr	%o2
+	sethi	%hi(1 << (32 - 4 - 1)), %g1
+	cmp	%o3, %g1
+	blu	not_really_big
+	clr	%o4
+
+	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
+	! as our usual N-at-a-shot divide step will cause overflow and havoc.
+	! The number of bits in the result here is N*ITER+SC, where SC <= N.
+	! Compute ITER in an unorthodox manner: know we need to shift V into
+	! the top decade: so do not even bother to compare to R.
+	1:
+		cmp	%o5, %g1
+		bgeu	3f
+		mov	1, %g2
+		sll	%o5, 4, %o5
+		b	1b
+		add	%o4, 1, %o4
+
+	! Now compute %g2.
+	2:	addcc	%o5, %o5, %o5
+		bcc	not_too_big
+		add	%g2, 1, %g2
+
+		! We get here if the %o1 overflowed while shifting.
+		! This means that %o3 has the high-order bit set.
+		! Restore %o5 and subtract from %o3.
+		sll	%g1, 4, %g1	! high order bit
+		srl	%o5, 1, %o5		! rest of %o5
+		add	%o5, %g1, %o5
+		b	do_single_div
+		sub	%g2, 1, %g2
+
+	not_too_big:
+	3:	cmp	%o5, %o3
+		blu	2b
+		nop
+		be	do_single_div
+		nop
+	/* NB: these are commented out in the V8-SPARC manual as well */
+	/* (I do not understand this) */
+	! %o5 > %o3: went too far: back up 1 step
+	!	srl	%o5, 1, %o5
+	!	dec	%g2
+	! do single-bit divide steps
+	!
+	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
+	! first divide step without thinking.  BUT, the others are conditional,
+	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
+	! order bit set in the first step, just falling into the regular
+	! division loop will mess up the first time around.
+	! So we unroll slightly...
+	do_single_div:
+		subcc	%g2, 1, %g2
+		bl	end_regular_divide
+		nop
+		sub	%o3, %o5, %o3
+		mov	1, %o2
+		b	end_single_divloop
+		nop
+	single_divloop:
+		sll	%o2, 1, %o2
+		bl	1f
+		srl	%o5, 1, %o5
+		! %o3 >= 0
+		sub	%o3, %o5, %o3
+		b	2f
+		add	%o2, 1, %o2
+	1:	! %o3 < 0
+		add	%o3, %o5, %o3
+		sub	%o2, 1, %o2
+	2:
+	end_single_divloop:
+		subcc	%g2, 1, %g2
+		bge	single_divloop
+		tst	%o3
+		b,a	end_regular_divide
+
+not_really_big:
+1:
+	sll	%o5, 4, %o5
+	cmp	%o5, %o3
+	bleu	1b
+	addcc	%o4, 1, %o4
+	be	got_result
+	sub	%o4, 1, %o4
+
+	tst	%o3	! set up for initial iteration
+divloop:
+	sll	%o2, 4, %o2
+		! depth 1, accumulated bits 0
+	bl	L1.16
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	! depth 2, accumulated bits 1
+	bl	L2.17
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	! depth 3, accumulated bits 3
+	bl	L3.19
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	! depth 4, accumulated bits 7
+	bl	L4.23
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (7*2+1), %o2
+L4.23:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (7*2-1), %o2
+	
+L3.19:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	! depth 4, accumulated bits 5
+	bl	L4.21
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (5*2+1), %o2
+	
+L4.21:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (5*2-1), %o2
+	
+L2.17:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	! depth 3, accumulated bits 1
+	bl	L3.17
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	! depth 4, accumulated bits 3
+	bl	L4.19
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (3*2+1), %o2
+	
+L4.19:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (3*2-1), %o2
+	
+L3.17:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	! depth 4, accumulated bits 1
+	bl	L4.17
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (1*2+1), %o2
+	
+L4.17:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (1*2-1), %o2
+	
+L1.16:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	! depth 2, accumulated bits -1
+	bl	L2.15
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	! depth 3, accumulated bits -1
+	bl	L3.15
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	! depth 4, accumulated bits -1
+	bl	L4.15
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (-1*2+1), %o2
+	
+L4.15:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (-1*2-1), %o2
+	
+L3.15:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	! depth 4, accumulated bits -3
+	bl	L4.13
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (-3*2+1), %o2
+	
+L4.13:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (-3*2-1), %o2
+	
+L2.15:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	! depth 3, accumulated bits -3
+	bl	L3.13
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	! depth 4, accumulated bits -5
+	bl	L4.11
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (-5*2+1), %o2
+	
+L4.11:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (-5*2-1), %o2
+	
+L3.13:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	! depth 4, accumulated bits -7
+	bl	L4.9
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (-7*2+1), %o2
+	
+L4.9:
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+	b	9f
+	add	%o2, (-7*2-1), %o2
+	
+	9:
+end_regular_divide:
+	subcc	%o4, 1, %o4
+	bge	divloop
+	tst	%o3
+	bl,a	got_result
+	! non-restoring fixup here (one instruction only!)
+	add	%o3, %o1, %o3
+
+got_result:
+	! check to see if answer should be < 0
+	tst	%g3
+	bl,a	1f
+	sub %g0, %o3, %o3
+1:
+	retl
+	mov %o3, %o0
+
+#endif
+
diff --git a/libgcc/config/sparc/t-softmul b/libgcc/config/sparc/t-softmul
index 49faae47c53..7142200600f 100644
--- a/libgcc/config/sparc/t-softmul
+++ b/libgcc/config/sparc/t-softmul
@@ -1,2 +1,2 @@
-LIB1ASMSRC = sparc/lb1spc.asm
+LIB1ASMSRC = sparc/lb1spc.S
 LIB1ASMFUNCS = _mulsi3 _divsi3 _modsi3
diff --git a/libgcc/config/v850/lib1funcs.S b/libgcc/config/v850/lib1funcs.S
new file mode 100644
index 00000000000..04e9b1e0ad4
--- /dev/null
+++ b/libgcc/config/v850/lib1funcs.S
@@ -0,0 +1,2330 @@
+/* libgcc routines for NEC V850.
+   Copyright (C) 1996, 1997, 2002, 2005, 2009, 2010
+   Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#ifdef L_mulsi3
+	.text
+	.globl ___mulsi3
+	.type  ___mulsi3,@function
+___mulsi3:
+#ifdef __v850__	
+/*
+   #define SHIFT 12
+   #define MASK ((1 << SHIFT) - 1)
+    
+   #define STEP(i, j)                               \
+   ({                                               \
+       short a_part = (a >> (i)) & MASK;            \
+       short b_part = (b >> (j)) & MASK;            \
+       int res = (((int) a_part) * ((int) b_part)); \
+       res;                                         \
+   })
+  
+   int
+   __mulsi3 (unsigned a, unsigned b)
+   {
+      return STEP (0, 0) +
+          ((STEP (SHIFT, 0) + STEP (0, SHIFT)) << SHIFT) +
+          ((STEP (0, 2 * SHIFT) + STEP (SHIFT, SHIFT) + STEP (2 * SHIFT, 0))
+           << (2 * SHIFT));
+   }
+*/
+        mov   r6, r14
+        movea lo(32767), r0, r10
+        and   r10, r14
+        mov   r7,  r15
+        and   r10, r15
+        shr   15,  r6
+        mov   r6,  r13
+        and   r10, r13
+        shr   15,  r7
+        mov   r7,  r12
+        and   r10, r12
+        shr   15,  r6
+        shr   15,  r7
+        mov   r14, r10
+        mulh  r15, r10
+        mov   r14, r11
+        mulh  r12, r11
+        mov   r13, r16
+        mulh  r15, r16
+        mulh  r14, r7
+        mulh  r15, r6
+        add   r16, r11
+        mulh  r13, r12
+        shl   15,  r11
+        add   r11, r10
+        add   r12, r7
+        add   r6,  r7
+        shl   30,  r7
+        add   r7,  r10
+        jmp   [r31]
+#endif /* __v850__ */
+#if defined(__v850e__) || defined(__v850ea__) || defined(__v850e2__) || defined(__v850e2v3__)
+        /* This routine is almost unneccesarry because gcc
+           generates the MUL instruction for the RTX mulsi3.
+           But if someone wants to link his application with
+           previsously compiled v850 objects then they will 
+	   need this function.  */
+ 
+        /* It isn't good to put the inst sequence as below;
+              mul r7, r6,
+              mov r6, r10, r0
+           In this case, there is a RAW hazard between them.
+           MUL inst takes 2 cycle in EX stage, then MOV inst
+           must wait 1cycle.  */
+        mov   r7, r10
+        mul   r6, r10, r0
+        jmp   [r31]
+#endif /* __v850e__ */
+	.size ___mulsi3,.-___mulsi3
+#endif /* L_mulsi3 */
+
+
+#ifdef L_udivsi3
+	.text
+	.global ___udivsi3
+	.type	___udivsi3,@function
+___udivsi3:
+#ifdef __v850__
+	mov 1,r12
+	mov 0,r10
+	cmp r6,r7
+	bnl .L12
+	movhi hi(-2147483648),r0,r13
+	cmp r0,r7
+	blt .L12
+.L4:
+	shl 1,r7
+	shl 1,r12
+	cmp r6,r7
+	bnl .L12
+	cmp r0,r12
+	be .L8
+	mov r7,r19
+	and r13,r19
+	be .L4
+	br .L12
+.L9:
+	cmp r7,r6
+	bl .L10
+	sub r7,r6
+	or r12,r10
+.L10:
+	shr 1,r12
+	shr 1,r7
+.L12:
+	cmp r0,r12
+	bne .L9
+.L8:
+	jmp [r31]
+
+#else /* defined(__v850e__) */
+
+	/* See comments at end of __mulsi3.  */
+	mov   r6, r10	
+	divu  r7, r10, r0
+	jmp   [r31]		
+
+#endif /* __v850e__ */
+
+	.size ___udivsi3,.-___udivsi3
+#endif
+
+#ifdef L_divsi3
+	.text
+	.globl ___divsi3
+	.type  ___divsi3,@function
+___divsi3:
+#ifdef __v850__
+	add -8,sp
+	st.w r31,4[sp]
+	st.w r22,0[sp]
+	mov 1,r22
+	tst r7,r7
+	bp .L3
+	subr r0,r7
+	subr r0,r22
+.L3:
+	tst r6,r6
+	bp .L4
+	subr r0,r6
+	subr r0,r22
+.L4:
+	jarl ___udivsi3,r31
+	cmp r0,r22
+	bp .L7
+	subr r0,r10
+.L7:
+	ld.w 0[sp],r22
+	ld.w 4[sp],r31
+	add 8,sp
+	jmp [r31]
+
+#else /* defined(__v850e__) */
+
+	/* See comments at end of __mulsi3.  */
+	mov   r6, r10
+	div   r7, r10, r0
+	jmp   [r31]
+
+#endif /* __v850e__ */
+
+	.size ___divsi3,.-___divsi3
+#endif
+
+#ifdef  L_umodsi3
+	.text
+	.globl ___umodsi3
+	.type  ___umodsi3,@function
+___umodsi3:
+#ifdef __v850__
+	add -12,sp
+	st.w r31,8[sp]
+	st.w r7,4[sp]
+	st.w r6,0[sp]
+	jarl ___udivsi3,r31
+	ld.w 4[sp],r7
+	mov r10,r6
+	jarl ___mulsi3,r31
+	ld.w 0[sp],r6
+	subr r6,r10
+	ld.w 8[sp],r31
+	add 12,sp
+	jmp [r31]
+
+#else /* defined(__v850e__) */
+
+	/* See comments at end of __mulsi3.  */
+	divu  r7, r6, r10
+	jmp   [r31]
+
+#endif /* __v850e__ */
+
+	.size ___umodsi3,.-___umodsi3
+#endif /* L_umodsi3 */
+
+#ifdef  L_modsi3
+	.text
+	.globl ___modsi3
+	.type  ___modsi3,@function
+___modsi3:
+#ifdef __v850__	
+	add -12,sp
+	st.w r31,8[sp]
+	st.w r7,4[sp]
+	st.w r6,0[sp]
+	jarl ___divsi3,r31
+	ld.w 4[sp],r7
+	mov r10,r6
+	jarl ___mulsi3,r31
+	ld.w 0[sp],r6
+	subr r6,r10
+	ld.w 8[sp],r31
+	add 12,sp
+	jmp [r31]
+
+#else /* defined(__v850e__) */
+
+	/* See comments at end of __mulsi3.  */
+	div  r7, r6, r10
+	jmp [r31]
+
+#endif /* __v850e__ */
+
+	.size ___modsi3,.-___modsi3
+#endif /* L_modsi3 */
+
+#ifdef	L_save_2
+	.text
+	.align	2
+	.globl	__save_r2_r29
+	.type	__save_r2_r29,@function
+	/* Allocate space and save registers 2, 20 .. 29 on the stack.  */
+	/* Called via:	jalr __save_r2_r29,r10.  */
+__save_r2_r29:
+#ifdef __EP__
+	mov	ep,r1
+	addi	-44,sp,sp
+	mov	sp,ep
+	sst.w	r29,0[ep]
+	sst.w	r28,4[ep]
+	sst.w	r27,8[ep]
+	sst.w	r26,12[ep]
+	sst.w	r25,16[ep]
+	sst.w	r24,20[ep]
+	sst.w	r23,24[ep]
+	sst.w	r22,28[ep]
+	sst.w	r21,32[ep]
+	sst.w	r20,36[ep]
+	sst.w	r2,40[ep]
+	mov	r1,ep
+#else
+	addi	-44,sp,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	st.w	r27,8[sp]
+	st.w	r26,12[sp]
+	st.w	r25,16[sp]
+	st.w	r24,20[sp]
+	st.w	r23,24[sp]
+	st.w	r22,28[sp]
+	st.w	r21,32[sp]
+	st.w	r20,36[sp]
+	st.w	r2,40[sp]
+#endif
+	jmp	[r10]
+	.size	__save_r2_r29,.-__save_r2_r29
+
+	/* Restore saved registers, deallocate stack and return to the user.  */
+	/* Called via:	jr __return_r2_r29.  */
+	.align	2
+	.globl	__return_r2_r29
+	.type	__return_r2_r29,@function
+__return_r2_r29:
+#ifdef __EP__
+	mov	ep,r1
+	mov	sp,ep
+	sld.w	0[ep],r29
+	sld.w	4[ep],r28
+	sld.w	8[ep],r27
+	sld.w	12[ep],r26
+	sld.w	16[ep],r25
+	sld.w	20[ep],r24
+	sld.w	24[ep],r23
+	sld.w	28[ep],r22
+	sld.w	32[ep],r21
+	sld.w	36[ep],r20
+	sld.w	40[ep],r2
+	addi	44,sp,sp
+	mov	r1,ep
+#else
+	ld.w	0[sp],r29
+	ld.w	4[sp],r28
+	ld.w	8[sp],r27
+	ld.w	12[sp],r26
+	ld.w	16[sp],r25
+	ld.w	20[sp],r24
+	ld.w	24[sp],r23
+	ld.w	28[sp],r22
+	ld.w	32[sp],r21
+	ld.w	36[sp],r20
+	ld.w	40[sp],r2
+	addi	44,sp,sp
+#endif
+	jmp	[r31]
+	.size	__return_r2_r29,.-__return_r2_r29
+#endif /* L_save_2 */
+
+#ifdef	L_save_20
+	.text
+	.align	2
+	.globl	__save_r20_r29
+	.type	__save_r20_r29,@function
+	/* Allocate space and save registers 20 .. 29 on the stack.  */
+	/* Called via:	jalr __save_r20_r29,r10.  */
+__save_r20_r29:
+#ifdef __EP__
+	mov	ep,r1
+	addi	-40,sp,sp
+	mov	sp,ep
+	sst.w	r29,0[ep]
+	sst.w	r28,4[ep]
+	sst.w	r27,8[ep]
+	sst.w	r26,12[ep]
+	sst.w	r25,16[ep]
+	sst.w	r24,20[ep]
+	sst.w	r23,24[ep]
+	sst.w	r22,28[ep]
+	sst.w	r21,32[ep]
+	sst.w	r20,36[ep]
+	mov	r1,ep
+#else
+	addi	-40,sp,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	st.w	r27,8[sp]
+	st.w	r26,12[sp]
+	st.w	r25,16[sp]
+	st.w	r24,20[sp]
+	st.w	r23,24[sp]
+	st.w	r22,28[sp]
+	st.w	r21,32[sp]
+	st.w	r20,36[sp]
+#endif
+	jmp	[r10]
+	.size	__save_r20_r29,.-__save_r20_r29
+
+	/* Restore saved registers, deallocate stack and return to the user.  */
+	/* Called via:	jr __return_r20_r29.  */
+	.align	2
+	.globl	__return_r20_r29
+	.type	__return_r20_r29,@function
+__return_r20_r29:
+#ifdef __EP__
+	mov	ep,r1
+	mov	sp,ep
+	sld.w	0[ep],r29
+	sld.w	4[ep],r28
+	sld.w	8[ep],r27
+	sld.w	12[ep],r26
+	sld.w	16[ep],r25
+	sld.w	20[ep],r24
+	sld.w	24[ep],r23
+	sld.w	28[ep],r22
+	sld.w	32[ep],r21
+	sld.w	36[ep],r20
+	addi	40,sp,sp
+	mov	r1,ep
+#else
+	ld.w	0[sp],r29
+	ld.w	4[sp],r28
+	ld.w	8[sp],r27
+	ld.w	12[sp],r26
+	ld.w	16[sp],r25
+	ld.w	20[sp],r24
+	ld.w	24[sp],r23
+	ld.w	28[sp],r22
+	ld.w	32[sp],r21
+	ld.w	36[sp],r20
+	addi	40,sp,sp
+#endif
+	jmp	[r31]
+	.size	__return_r20_r29,.-__return_r20_r29
+#endif /* L_save_20 */
+
+#ifdef	L_save_21
+	.text
+	.align	2
+	.globl	__save_r21_r29
+	.type	__save_r21_r29,@function
+	/* Allocate space and save registers 21 .. 29 on the stack.  */
+	/* Called via:	jalr __save_r21_r29,r10.  */
+__save_r21_r29:
+#ifdef __EP__
+	mov	ep,r1
+	addi	-36,sp,sp
+	mov	sp,ep
+	sst.w	r29,0[ep]
+	sst.w	r28,4[ep]
+	sst.w	r27,8[ep]
+	sst.w	r26,12[ep]
+	sst.w	r25,16[ep]
+	sst.w	r24,20[ep]
+	sst.w	r23,24[ep]
+	sst.w	r22,28[ep]
+	sst.w	r21,32[ep]
+	mov	r1,ep
+#else
+	addi	-36,sp,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	st.w	r27,8[sp]
+	st.w	r26,12[sp]
+	st.w	r25,16[sp]
+	st.w	r24,20[sp]
+	st.w	r23,24[sp]
+	st.w	r22,28[sp]
+	st.w	r21,32[sp]
+#endif
+	jmp	[r10]
+	.size	__save_r21_r29,.-__save_r21_r29
+
+	/* Restore saved registers, deallocate stack and return to the user.  */
+	/* Called via:	jr __return_r21_r29.  */
+	.align	2
+	.globl	__return_r21_r29
+	.type	__return_r21_r29,@function
+__return_r21_r29:
+#ifdef __EP__
+	mov	ep,r1
+	mov	sp,ep
+	sld.w	0[ep],r29
+	sld.w	4[ep],r28
+	sld.w	8[ep],r27
+	sld.w	12[ep],r26
+	sld.w	16[ep],r25
+	sld.w	20[ep],r24
+	sld.w	24[ep],r23
+	sld.w	28[ep],r22
+	sld.w	32[ep],r21
+	addi	36,sp,sp
+	mov	r1,ep
+#else
+	ld.w	0[sp],r29
+	ld.w	4[sp],r28
+	ld.w	8[sp],r27
+	ld.w	12[sp],r26
+	ld.w	16[sp],r25
+	ld.w	20[sp],r24
+	ld.w	24[sp],r23
+	ld.w	28[sp],r22
+	ld.w	32[sp],r21
+	addi	36,sp,sp
+#endif
+	jmp	[r31]
+	.size	__return_r21_r29,.-__return_r21_r29
+#endif /* L_save_21 */
+
+#ifdef	L_save_22
+	.text
+	.align	2
+	.globl	__save_r22_r29
+	.type	__save_r22_r29,@function
+	/* Allocate space and save registers 22 .. 29 on the stack.  */
+	/* Called via:	jalr __save_r22_r29,r10.  */
+__save_r22_r29:
+#ifdef __EP__
+	mov	ep,r1
+	addi	-32,sp,sp
+	mov	sp,ep
+	sst.w	r29,0[ep]
+	sst.w	r28,4[ep]
+	sst.w	r27,8[ep]
+	sst.w	r26,12[ep]
+	sst.w	r25,16[ep]
+	sst.w	r24,20[ep]
+	sst.w	r23,24[ep]
+	sst.w	r22,28[ep]
+	mov	r1,ep
+#else
+	addi	-32,sp,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	st.w	r27,8[sp]
+	st.w	r26,12[sp]
+	st.w	r25,16[sp]
+	st.w	r24,20[sp]
+	st.w	r23,24[sp]
+	st.w	r22,28[sp]
+#endif
+	jmp	[r10]
+	.size	__save_r22_r29,.-__save_r22_r29
+
+	/* Restore saved registers, deallocate stack and return to the user.  */
+	/* Called via:	jr __return_r22_r29.  */
+	.align	2
+	.globl	__return_r22_r29
+	.type	__return_r22_r29,@function
+__return_r22_r29:
+#ifdef __EP__
+	mov	ep,r1
+	mov	sp,ep
+	sld.w	0[ep],r29
+	sld.w	4[ep],r28
+	sld.w	8[ep],r27
+	sld.w	12[ep],r26
+	sld.w	16[ep],r25
+	sld.w	20[ep],r24
+	sld.w	24[ep],r23
+	sld.w	28[ep],r22
+	addi	32,sp,sp
+	mov	r1,ep
+#else
+	ld.w	0[sp],r29
+	ld.w	4[sp],r28
+	ld.w	8[sp],r27
+	ld.w	12[sp],r26
+	ld.w	16[sp],r25
+	ld.w	20[sp],r24
+	ld.w	24[sp],r23
+	ld.w	28[sp],r22
+	addi	32,sp,sp
+#endif
+	jmp	[r31]
+	.size	__return_r22_r29,.-__return_r22_r29
+#endif /* L_save_22 */
+
+#ifdef	L_save_23
+	.text
+	.align	2
+	.globl	__save_r23_r29
+	.type	__save_r23_r29,@function
+	/* Allocate space and save registers 23 .. 29 on the stack.  */
+	/* Called via:	jalr __save_r23_r29,r10.  */
+__save_r23_r29:
+#ifdef __EP__
+	mov	ep,r1
+	addi	-28,sp,sp
+	mov	sp,ep
+	sst.w	r29,0[ep]
+	sst.w	r28,4[ep]
+	sst.w	r27,8[ep]
+	sst.w	r26,12[ep]
+	sst.w	r25,16[ep]
+	sst.w	r24,20[ep]
+	sst.w	r23,24[ep]
+	mov	r1,ep
+#else
+	addi	-28,sp,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	st.w	r27,8[sp]
+	st.w	r26,12[sp]
+	st.w	r25,16[sp]
+	st.w	r24,20[sp]
+	st.w	r23,24[sp]
+#endif
+	jmp	[r10]
+	.size	__save_r23_r29,.-__save_r23_r29
+
+	/* Restore saved registers, deallocate stack and return to the user.  */
+	/* Called via:	jr __return_r23_r29.  */
+	.align	2
+	.globl	__return_r23_r29
+	.type	__return_r23_r29,@function
+__return_r23_r29:
+#ifdef __EP__
+	mov	ep,r1
+	mov	sp,ep
+	sld.w	0[ep],r29
+	sld.w	4[ep],r28
+	sld.w	8[ep],r27
+	sld.w	12[ep],r26
+	sld.w	16[ep],r25
+	sld.w	20[ep],r24
+	sld.w	24[ep],r23
+	addi	28,sp,sp
+	mov	r1,ep
+#else
+	ld.w	0[sp],r29
+	ld.w	4[sp],r28
+	ld.w	8[sp],r27
+	ld.w	12[sp],r26
+	ld.w	16[sp],r25
+	ld.w	20[sp],r24
+	ld.w	24[sp],r23
+	addi	28,sp,sp
+#endif
+	jmp	[r31]
+	.size	__return_r23_r29,.-__return_r23_r29
+#endif /* L_save_23 */
+
+#ifdef	L_save_24
+	.text
+	.align	2
+	.globl	__save_r24_r29
+	.type	__save_r24_r29,@function
+	/* Allocate space and save registers 24 .. 29 on the stack.  */
+	/* Called via:	jalr __save_r24_r29,r10.  */
+__save_r24_r29:
+#ifdef __EP__
+	mov	ep,r1
+	addi	-24,sp,sp
+	mov	sp,ep
+	sst.w	r29,0[ep]
+	sst.w	r28,4[ep]
+	sst.w	r27,8[ep]
+	sst.w	r26,12[ep]
+	sst.w	r25,16[ep]
+	sst.w	r24,20[ep]
+	mov	r1,ep
+#else
+	addi	-24,sp,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	st.w	r27,8[sp]
+	st.w	r26,12[sp]
+	st.w	r25,16[sp]
+	st.w	r24,20[sp]
+#endif
+	jmp	[r10]
+	.size	__save_r24_r29,.-__save_r24_r29
+
+	/* Restore saved registers, deallocate stack and return to the user.  */
+	/* Called via:	jr __return_r24_r29.  */
+	.align	2
+	.globl	__return_r24_r29
+	.type	__return_r24_r29,@function
+__return_r24_r29:
+#ifdef __EP__
+	mov	ep,r1
+	mov	sp,ep
+	sld.w	0[ep],r29
+	sld.w	4[ep],r28
+	sld.w	8[ep],r27
+	sld.w	12[ep],r26
+	sld.w	16[ep],r25
+	sld.w	20[ep],r24
+	addi	24,sp,sp
+	mov	r1,ep
+#else
+	ld.w	0[sp],r29
+	ld.w	4[sp],r28
+	ld.w	8[sp],r27
+	ld.w	12[sp],r26
+	ld.w	16[sp],r25
+	ld.w	20[sp],r24
+	addi	24,sp,sp
+#endif
+	jmp	[r31]
+	.size	__return_r24_r29,.-__return_r24_r29
+#endif /* L_save_24 */
+
+#ifdef	L_save_25
+	.text
+	.align	2
+	.globl	__save_r25_r29
+	.type	__save_r25_r29,@function
+	/* Allocate space and save registers 25 .. 29 on the stack.  */
+	/* Called via:	jalr __save_r25_r29,r10.  */
+__save_r25_r29:
+#ifdef __EP__
+	mov	ep,r1
+	addi	-20,sp,sp
+	mov	sp,ep
+	sst.w	r29,0[ep]
+	sst.w	r28,4[ep]
+	sst.w	r27,8[ep]
+	sst.w	r26,12[ep]
+	sst.w	r25,16[ep]
+	mov	r1,ep
+#else
+	addi	-20,sp,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	st.w	r27,8[sp]
+	st.w	r26,12[sp]
+	st.w	r25,16[sp]
+#endif
+	jmp	[r10]
+	.size	__save_r25_r29,.-__save_r25_r29
+
+	/* Restore saved registers, deallocate stack and return to the user.  */
+	/* Called via:	jr __return_r25_r29.  */
+	.align	2
+	.globl	__return_r25_r29
+	.type	__return_r25_r29,@function
+__return_r25_r29:
+#ifdef __EP__
+	mov	ep,r1
+	mov	sp,ep
+	sld.w	0[ep],r29
+	sld.w	4[ep],r28
+	sld.w	8[ep],r27
+	sld.w	12[ep],r26
+	sld.w	16[ep],r25
+	addi	20,sp,sp
+	mov	r1,ep
+#else
+	ld.w	0[ep],r29
+	ld.w	4[ep],r28
+	ld.w	8[ep],r27
+	ld.w	12[ep],r26
+	ld.w	16[ep],r25
+	addi	20,sp,sp
+#endif
+	jmp	[r31]
+	.size	__return_r25_r29,.-__return_r25_r29
+#endif /* L_save_25 */
+
+#ifdef	L_save_26
+	.text
+	.align	2
+	.globl	__save_r26_r29
+	.type	__save_r26_r29,@function
+	/* Allocate space and save registers 26 .. 29 on the stack.  */
+	/* Called via:	jalr __save_r26_r29,r10.  */
+__save_r26_r29:
+#ifdef __EP__
+	mov	ep,r1
+	add	-16,sp
+	mov	sp,ep
+	sst.w	r29,0[ep]
+	sst.w	r28,4[ep]
+	sst.w	r27,8[ep]
+	sst.w	r26,12[ep]
+	mov	r1,ep
+#else
+	add	-16,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	st.w	r27,8[sp]
+	st.w	r26,12[sp]
+#endif
+	jmp	[r10]
+	.size	__save_r26_r29,.-__save_r26_r29
+
+	/* Restore saved registers, deallocate stack and return to the user.  */
+	/* Called via:	jr __return_r26_r29.  */
+	.align	2
+	.globl	__return_r26_r29
+	.type	__return_r26_r29,@function
+__return_r26_r29:
+#ifdef __EP__
+	mov	ep,r1
+	mov	sp,ep
+	sld.w	0[ep],r29
+	sld.w	4[ep],r28
+	sld.w	8[ep],r27
+	sld.w	12[ep],r26
+	addi	16,sp,sp
+	mov	r1,ep
+#else
+	ld.w	0[sp],r29
+	ld.w	4[sp],r28
+	ld.w	8[sp],r27
+	ld.w	12[sp],r26
+	addi	16,sp,sp
+#endif
+	jmp	[r31]
+	.size	__return_r26_r29,.-__return_r26_r29
+#endif /* L_save_26 */
+
+#ifdef	L_save_27
+	.text
+	.align	2
+	.globl	__save_r27_r29
+	.type	__save_r27_r29,@function
+	/* Allocate space and save registers 27 .. 29 on the stack.  */
+	/* Called via:	jalr __save_r27_r29,r10.  */
+__save_r27_r29:
+	add	-12,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	st.w	r27,8[sp]
+	jmp	[r10]
+	.size	__save_r27_r29,.-__save_r27_r29
+
+	/* Restore saved registers, deallocate stack and return to the user.  */
+	/* Called via:	jr __return_r27_r29.  */
+	.align	2
+	.globl	__return_r27_r29
+	.type	__return_r27_r29,@function
+__return_r27_r29:
+	ld.w	0[sp],r29
+	ld.w	4[sp],r28
+	ld.w	8[sp],r27
+	add	12,sp
+	jmp	[r31]
+	.size	__return_r27_r29,.-__return_r27_r29
+#endif /* L_save_27 */
+
+#ifdef	L_save_28
+	.text
+	.align	2
+	.globl	__save_r28_r29
+	.type	__save_r28_r29,@function
+	/* Allocate space and save registers 28,29 on the stack.  */
+	/* Called via:	jalr __save_r28_r29,r10.  */
+__save_r28_r29:
+	add	-8,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	jmp	[r10]
+	.size	__save_r28_r29,.-__save_r28_r29
+
+	/* Restore saved registers, deallocate stack and return to the user.  */
+	/* Called via:	jr __return_r28_r29.  */
+	.align	2
+	.globl	__return_r28_r29
+	.type	__return_r28_r29,@function
+__return_r28_r29:
+	ld.w	0[sp],r29
+	ld.w	4[sp],r28
+	add	8,sp
+	jmp	[r31]
+	.size	__return_r28_r29,.-__return_r28_r29
+#endif /* L_save_28 */
+
+#ifdef	L_save_29
+	.text
+	.align	2
+	.globl	__save_r29
+	.type	__save_r29,@function
+	/* Allocate space and save register 29 on the stack.  */
+	/* Called via:	jalr __save_r29,r10.  */
+__save_r29:
+	add	-4,sp
+	st.w	r29,0[sp]
+	jmp	[r10]
+	.size	__save_r29,.-__save_r29
+
+	/* Restore saved register 29, deallocate stack and return to the user.  */
+	/* Called via:	jr __return_r29.  */
+	.align	2
+	.globl	__return_r29
+	.type	__return_r29,@function
+__return_r29:
+	ld.w	0[sp],r29
+	add	4,sp
+	jmp	[r31]
+	.size	__return_r29,.-__return_r29
+#endif /* L_save_28 */
+
+#ifdef	L_save_2c
+	.text
+	.align	2
+	.globl	__save_r2_r31
+	.type	__save_r2_r31,@function
+	/* Allocate space and save registers 20 .. 29, 31 on the stack.  */
+	/* Also allocate space for the argument save area.  */
+	/* Called via:	jalr __save_r2_r31,r10.  */
+__save_r2_r31:
+#ifdef __EP__
+	mov	ep,r1
+	addi	-48,sp,sp
+	mov	sp,ep
+	sst.w	r29,0[ep]
+	sst.w	r28,4[ep]
+	sst.w	r27,8[ep]
+	sst.w	r26,12[ep]
+	sst.w	r25,16[ep]
+	sst.w	r24,20[ep]
+	sst.w	r23,24[ep]
+	sst.w	r22,28[ep]
+	sst.w	r21,32[ep]
+	sst.w	r20,36[ep]
+	sst.w	r2,40[ep]
+	sst.w	r31,44[ep]
+	mov	r1,ep
+#else
+	addi	-48,sp,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	st.w	r27,8[sp]
+	st.w	r26,12[sp]
+	st.w	r25,16[sp]
+	st.w	r24,20[sp]
+	st.w	r23,24[sp]
+	st.w	r22,28[sp]
+	st.w	r21,32[sp]
+	st.w	r20,36[sp]
+	st.w	r2,40[sp]
+	st.w	r31,44[sp]
+#endif
+	jmp	[r10]
+	.size	__save_r2_r31,.-__save_r2_r31
+
+	/* Restore saved registers, deallocate stack and return to the user.  */
+	/* Called via:	jr __return_r20_r31.  */
+	.align	2
+	.globl	__return_r2_r31
+	.type	__return_r2_r31,@function
+__return_r2_r31:
+#ifdef __EP__
+	mov	ep,r1
+	mov	sp,ep
+	sld.w	0[ep],r29
+	sld.w	4[ep],r28
+	sld.w	8[ep],r27
+	sld.w	12[ep],r26
+	sld.w	16[ep],r25
+	sld.w	20[ep],r24
+	sld.w	24[ep],r23
+	sld.w	28[ep],r22
+	sld.w	32[ep],r21
+	sld.w	36[ep],r20
+	sld.w	40[ep],r2
+	sld.w	44[ep],r31
+	addi	48,sp,sp
+	mov	r1,ep
+#else
+	ld.w	44[sp],r29
+	ld.w	40[sp],r28
+	ld.w	36[sp],r27
+	ld.w	32[sp],r26
+	ld.w	28[sp],r25
+	ld.w	24[sp],r24
+	ld.w	20[sp],r23
+	ld.w	16[sp],r22
+	ld.w	12[sp],r21
+	ld.w	8[sp],r20
+	ld.w	4[sp],r2
+	ld.w	0[sp],r31
+	addi	48,sp,sp
+#endif
+	jmp	[r31]
+	.size	__return_r2_r31,.-__return_r2_r31
+#endif /* L_save_2c */
+
+#ifdef	L_save_20c
+	.text
+	.align	2
+	.globl	__save_r20_r31
+	.type	__save_r20_r31,@function
+	/* Allocate space and save registers 20 .. 29, 31 on the stack.  */
+	/* Also allocate space for the argument save area.  */
+	/* Called via:	jalr __save_r20_r31,r10.  */
+__save_r20_r31:
+#ifdef __EP__
+	mov	ep,r1
+	addi	-44,sp,sp
+	mov	sp,ep
+	sst.w	r29,0[ep]
+	sst.w	r28,4[ep]
+	sst.w	r27,8[ep]
+	sst.w	r26,12[ep]
+	sst.w	r25,16[ep]
+	sst.w	r24,20[ep]
+	sst.w	r23,24[ep]
+	sst.w	r22,28[ep]
+	sst.w	r21,32[ep]
+	sst.w	r20,36[ep]
+	sst.w	r31,40[ep]
+	mov	r1,ep
+#else
+	addi	-44,sp,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	st.w	r27,8[sp]
+	st.w	r26,12[sp]
+	st.w	r25,16[sp]
+	st.w	r24,20[sp]
+	st.w	r23,24[sp]
+	st.w	r22,28[sp]
+	st.w	r21,32[sp]
+	st.w	r20,36[sp]
+	st.w	r31,40[sp]
+#endif
+	jmp	[r10]
+	.size	__save_r20_r31,.-__save_r20_r31
+
+	/* Restore saved registers, deallocate stack and return to the user.  */
+	/* Called via:	jr __return_r20_r31.  */
+	.align	2
+	.globl	__return_r20_r31
+	.type	__return_r20_r31,@function
+__return_r20_r31:
+#ifdef __EP__
+	mov	ep,r1
+	mov	sp,ep
+	sld.w	0[ep],r29
+	sld.w	4[ep],r28
+	sld.w	8[ep],r27
+	sld.w	12[ep],r26
+	sld.w	16[ep],r25
+	sld.w	20[ep],r24
+	sld.w	24[ep],r23
+	sld.w	28[ep],r22
+	sld.w	32[ep],r21
+	sld.w	36[ep],r20
+	sld.w	40[ep],r31
+	addi	44,sp,sp
+	mov	r1,ep
+#else
+	ld.w	0[sp],r29
+	ld.w	4[sp],r28
+	ld.w	8[sp],r27
+	ld.w	12[sp],r26
+	ld.w	16[sp],r25
+	ld.w	20[sp],r24
+	ld.w	24[sp],r23
+	ld.w	28[sp],r22
+	ld.w	32[sp],r21
+	ld.w	36[sp],r20
+	ld.w	40[sp],r31
+	addi	44,sp,sp
+#endif
+	jmp	[r31]
+	.size	__return_r20_r31,.-__return_r20_r31
+#endif /* L_save_20c */
+
+#ifdef	L_save_21c
+	.text
+	.align	2
+	.globl	__save_r21_r31
+	.type	__save_r21_r31,@function
+	/* Allocate space and save registers 21 .. 29, 31 on the stack.  */
+	/* Also allocate space for the argument save area.  */
+	/* Called via:	jalr __save_r21_r31,r10.  */
+__save_r21_r31:
+#ifdef __EP__	
+	mov	ep,r1
+	addi	-40,sp,sp
+	mov	sp,ep
+	sst.w	r29,0[ep]
+	sst.w	r28,4[ep]
+	sst.w	r27,8[ep]
+	sst.w	r26,12[ep]
+	sst.w	r25,16[ep]
+	sst.w	r24,20[ep]
+	sst.w	r23,24[ep]
+	sst.w	r22,28[ep]
+	sst.w	r21,32[ep]
+	sst.w	r31,36[ep]
+	mov	r1,ep
+	jmp	[r10]
+#else	
+	addi	-40,sp,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	st.w	r27,8[sp]
+	st.w	r26,12[sp]
+	st.w	r25,16[sp]
+	st.w	r24,20[sp]
+	st.w	r23,24[sp]
+	st.w	r22,28[sp]
+	st.w	r21,32[sp]
+	st.w	r31,36[sp]
+	jmp	[r10]
+#endif	
+	.size	__save_r21_r31,.-__save_r21_r31
+
+	/* Restore saved registers, deallocate stack and return to the user.  */
+	/* Called via:	jr __return_r21_r31.  */
+	.align	2
+	.globl	__return_r21_r31
+	.type	__return_r21_r31,@function
+__return_r21_r31:
+#ifdef __EP__
+	mov	ep,r1
+	mov	sp,ep
+	sld.w	0[ep],r29
+	sld.w	4[ep],r28
+	sld.w	8[ep],r27
+	sld.w	12[ep],r26
+	sld.w	16[ep],r25
+	sld.w	20[ep],r24
+	sld.w	24[ep],r23
+	sld.w	28[ep],r22
+	sld.w	32[ep],r21
+	sld.w	36[ep],r31
+	addi	40,sp,sp
+	mov	r1,ep
+#else
+	ld.w	0[sp],r29
+	ld.w	4[sp],r28
+	ld.w	8[sp],r27
+	ld.w	12[sp],r26
+	ld.w	16[sp],r25
+	ld.w	20[sp],r24
+	ld.w	24[sp],r23
+	ld.w	28[sp],r22
+	ld.w	32[sp],r21
+	ld.w	36[sp],r31
+	addi	40,sp,sp
+#endif
+	jmp	[r31]
+	.size	__return_r21_r31,.-__return_r21_r31
+#endif /* L_save_21c */
+
+#ifdef	L_save_22c
+	.text
+	.align	2
+	.globl	__save_r22_r31
+	.type	__save_r22_r31,@function
+	/* Allocate space and save registers 22 .. 29, 31 on the stack.  */
+	/* Also allocate space for the argument save area.  */
+	/* Called via:	jalr __save_r22_r31,r10.  */
+__save_r22_r31:
+#ifdef __EP__
+	mov	ep,r1
+	addi	-36,sp,sp
+	mov	sp,ep
+	sst.w	r29,0[ep]
+	sst.w	r28,4[ep]
+	sst.w	r27,8[ep]
+	sst.w	r26,12[ep]
+	sst.w	r25,16[ep]
+	sst.w	r24,20[ep]
+	sst.w	r23,24[ep]
+	sst.w	r22,28[ep]
+	sst.w	r31,32[ep]
+	mov	r1,ep
+#else
+	addi	-36,sp,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	st.w	r27,8[sp]
+	st.w	r26,12[sp]
+	st.w	r25,16[sp]
+	st.w	r24,20[sp]
+	st.w	r23,24[sp]
+	st.w	r22,28[sp]
+	st.w	r31,32[sp]
+#endif
+	jmp	[r10]
+	.size	__save_r22_r31,.-__save_r22_r31
+
+	/* Restore saved registers, deallocate stack and return to the user.  */
+	/* Called via:	jr __return_r22_r31.  */
+	.align	2
+	.globl	__return_r22_r31
+	.type	__return_r22_r31,@function
+__return_r22_r31:
+#ifdef __EP__
+	mov	ep,r1
+	mov	sp,ep
+	sld.w	0[ep],r29
+	sld.w	4[ep],r28
+	sld.w	8[ep],r27
+	sld.w	12[ep],r26
+	sld.w	16[ep],r25
+	sld.w	20[ep],r24
+	sld.w	24[ep],r23
+	sld.w	28[ep],r22
+	sld.w	32[ep],r31
+	addi	36,sp,sp
+	mov	r1,ep
+#else
+	ld.w	0[sp],r29
+	ld.w	4[sp],r28
+	ld.w	8[sp],r27
+	ld.w	12[sp],r26
+	ld.w	16[sp],r25
+	ld.w	20[sp],r24
+	ld.w	24[sp],r23
+	ld.w	28[sp],r22
+	ld.w	32[sp],r31
+	addi	36,sp,sp
+#endif
+	jmp	[r31]
+	.size	__return_r22_r31,.-__return_r22_r31
+#endif /* L_save_22c */
+
+#ifdef	L_save_23c
+	.text
+	.align	2
+	.globl	__save_r23_r31
+	.type	__save_r23_r31,@function
+	/* Allocate space and save registers 23 .. 29, 31 on the stack.  */
+	/* Also allocate space for the argument save area.  */
+	/* Called via:	jalr __save_r23_r31,r10.  */
+__save_r23_r31:
+#ifdef __EP__
+	mov	ep,r1
+	addi	-32,sp,sp
+	mov	sp,ep
+	sst.w	r29,0[ep]
+	sst.w	r28,4[ep]
+	sst.w	r27,8[ep]
+	sst.w	r26,12[ep]
+	sst.w	r25,16[ep]
+	sst.w	r24,20[ep]
+	sst.w	r23,24[ep]
+	sst.w	r31,28[ep]
+	mov	r1,ep
+#else
+	addi	-32,sp,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	st.w	r27,8[sp]
+	st.w	r26,12[sp]
+	st.w	r25,16[sp]
+	st.w	r24,20[sp]
+	st.w	r23,24[sp]
+	st.w	r31,28[sp]
+#endif
+	jmp	[r10]
+	.size	__save_r23_r31,.-__save_r23_r31
+
+	/* Restore saved registers, deallocate stack and return to the user.  */
+	/* Called via:	jr __return_r23_r31.  */
+	.align	2
+	.globl	__return_r23_r31
+	.type	__return_r23_r31,@function
+__return_r23_r31:
+#ifdef __EP__
+	mov	ep,r1
+	mov	sp,ep
+	sld.w	0[ep],r29
+	sld.w	4[ep],r28
+	sld.w	8[ep],r27
+	sld.w	12[ep],r26
+	sld.w	16[ep],r25
+	sld.w	20[ep],r24
+	sld.w	24[ep],r23
+	sld.w	28[ep],r31
+	addi	32,sp,sp
+	mov	r1,ep
+#else
+	ld.w	0[sp],r29
+	ld.w	4[sp],r28
+	ld.w	8[sp],r27
+	ld.w	12[sp],r26
+	ld.w	16[sp],r25
+	ld.w	20[sp],r24
+	ld.w	24[sp],r23
+	ld.w	28[sp],r31
+	addi	32,sp,sp
+#endif
+	jmp	[r31]
+	.size	__return_r23_r31,.-__return_r23_r31
+#endif /* L_save_23c */
+
+#ifdef	L_save_24c
+	.text
+	.align	2
+	.globl	__save_r24_r31
+	.type	__save_r24_r31,@function
+	/* Allocate space and save registers 24 .. 29, 31 on the stack.  */
+	/* Also allocate space for the argument save area.  */
+	/* Called via:	jalr __save_r24_r31,r10.  */
+__save_r24_r31:
+#ifdef __EP__
+	mov	ep,r1
+	addi	-28,sp,sp
+	mov	sp,ep
+	sst.w	r29,0[ep]
+	sst.w	r28,4[ep]
+	sst.w	r27,8[ep]
+	sst.w	r26,12[ep]
+	sst.w	r25,16[ep]
+	sst.w	r24,20[ep]
+	sst.w	r31,24[ep]
+	mov	r1,ep
+#else
+	addi	-28,sp,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	st.w	r27,8[sp]
+	st.w	r26,12[sp]
+	st.w	r25,16[sp]
+	st.w	r24,20[sp]
+	st.w	r31,24[sp]
+#endif
+	jmp	[r10]
+	.size	__save_r24_r31,.-__save_r24_r31
+
+	/* Restore saved registers, deallocate stack and return to the user.  */
+	/* Called via:	jr __return_r24_r31.  */
+	.align	2
+	.globl	__return_r24_r31
+	.type	__return_r24_r31,@function
+__return_r24_r31:
+#ifdef __EP__
+	mov	ep,r1
+	mov	sp,ep
+	sld.w	0[ep],r29
+	sld.w	4[ep],r28
+	sld.w	8[ep],r27
+	sld.w	12[ep],r26
+	sld.w	16[ep],r25
+	sld.w	20[ep],r24
+	sld.w	24[ep],r31
+	addi	28,sp,sp
+	mov	r1,ep
+#else
+	ld.w	0[sp],r29
+	ld.w	4[sp],r28
+	ld.w	8[sp],r27
+	ld.w	12[sp],r26
+	ld.w	16[sp],r25
+	ld.w	20[sp],r24
+	ld.w	24[sp],r31
+	addi	28,sp,sp
+#endif
+	jmp	[r31]
+	.size	__return_r24_r31,.-__return_r24_r31
+#endif /* L_save_24c */
+
+#ifdef	L_save_25c
+	.text
+	.align	2
+	.globl	__save_r25_r31
+	.type	__save_r25_r31,@function
+	/* Allocate space and save registers 25 .. 29, 31 on the stack.  */
+	/* Also allocate space for the argument save area.  */
+	/* Called via:	jalr __save_r25_r31,r10.  */
+__save_r25_r31:
+#ifdef __EP__
+	mov	ep,r1
+	addi	-24,sp,sp
+	mov	sp,ep
+	sst.w	r29,0[ep]
+	sst.w	r28,4[ep]
+	sst.w	r27,8[ep]
+	sst.w	r26,12[ep]
+	sst.w	r25,16[ep]
+	sst.w	r31,20[ep]
+	mov	r1,ep
+#else
+	addi	-24,sp,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	st.w	r27,8[sp]
+	st.w	r26,12[sp]
+	st.w	r25,16[sp]
+	st.w	r31,20[sp]
+#endif
+	jmp	[r10]
+	.size	__save_r25_r31,.-__save_r25_r31
+
+	/* Restore saved registers, deallocate stack and return to the user.  */
+	/* Called via:	jr __return_r25_r31.  */
+	.align	2
+	.globl	__return_r25_r31
+	.type	__return_r25_r31,@function
+__return_r25_r31:
+#ifdef __EP__
+	mov	ep,r1
+	mov	sp,ep
+	sld.w	0[ep],r29
+	sld.w	4[ep],r28
+	sld.w	8[ep],r27
+	sld.w	12[ep],r26
+	sld.w	16[ep],r25
+	sld.w	20[ep],r31
+	addi	24,sp,sp
+	mov	r1,ep
+#else
+	ld.w	0[sp],r29
+	ld.w	4[sp],r28
+	ld.w	8[sp],r27
+	ld.w	12[sp],r26
+	ld.w	16[sp],r25
+	ld.w	20[sp],r31
+	addi	24,sp,sp
+#endif
+	jmp	[r31]
+	.size	__return_r25_r31,.-__return_r25_r31
+#endif /* L_save_25c */
+
+#ifdef	L_save_26c
+	.text
+	.align	2
+	.globl	__save_r26_r31
+	.type	__save_r26_r31,@function
+	/* Allocate space and save registers 26 .. 29, 31 on the stack.  */
+	/* Also allocate space for the argument save area.  */
+	/* Called via:	jalr __save_r26_r31,r10.  */
+__save_r26_r31:
+#ifdef __EP__
+	mov	ep,r1
+	addi	-20,sp,sp
+	mov	sp,ep
+	sst.w	r29,0[ep]
+	sst.w	r28,4[ep]
+	sst.w	r27,8[ep]
+	sst.w	r26,12[ep]
+	sst.w	r31,16[ep]
+	mov	r1,ep
+#else
+	addi	-20,sp,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	st.w	r27,8[sp]
+	st.w	r26,12[sp]
+	st.w	r31,16[sp]
+#endif
+	jmp	[r10]
+	.size	__save_r26_r31,.-__save_r26_r31
+
+	/* Restore saved registers, deallocate stack and return to the user.  */
+	/* Called via:	jr __return_r26_r31.  */
+	.align	2
+	.globl	__return_r26_r31
+	.type	__return_r26_r31,@function
+__return_r26_r31:
+#ifdef __EP__
+	mov	ep,r1
+	mov	sp,ep
+	sld.w	0[ep],r29
+	sld.w	4[ep],r28
+	sld.w	8[ep],r27
+	sld.w	12[ep],r26
+	sld.w	16[ep],r31
+	addi	20,sp,sp
+	mov	r1,ep
+#else
+	ld.w	0[sp],r29
+	ld.w	4[sp],r28
+	ld.w	8[sp],r27
+	ld.w	12[sp],r26
+	ld.w	16[sp],r31
+	addi	20,sp,sp
+#endif
+	jmp	[r31]
+	.size	__return_r26_r31,.-__return_r26_r31
+#endif /* L_save_26c */
+
+#ifdef	L_save_27c
+	.text
+	.align	2
+	.globl	__save_r27_r31
+	.type	__save_r27_r31,@function
+	/* Allocate space and save registers 27 .. 29, 31 on the stack.  */
+	/* Also allocate space for the argument save area.  */
+	/* Called via:	jalr __save_r27_r31,r10.  */
+__save_r27_r31:
+#ifdef __EP__
+	mov	ep,r1
+	addi	-16,sp,sp
+	mov	sp,ep
+	sst.w	r29,0[ep]
+	sst.w	r28,4[ep]
+	sst.w	r27,8[ep]
+	sst.w	r31,12[ep]
+	mov	r1,ep
+#else
+	addi	-16,sp,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	st.w	r27,8[sp]
+	st.w	r31,12[sp]
+#endif
+	jmp	[r10]
+	.size	__save_r27_r31,.-__save_r27_r31
+
+	/* Restore saved registers, deallocate stack and return to the user.  */
+	/* Called via:	jr __return_r27_r31.  */
+	.align	2
+	.globl	__return_r27_r31
+	.type	__return_r27_r31,@function
+__return_r27_r31:
+#ifdef __EP__
+	mov	ep,r1
+	mov	sp,ep
+	sld.w	0[ep],r29
+	sld.w	4[ep],r28
+	sld.w	8[ep],r27
+	sld.w	12[ep],r31
+	addi	16,sp,sp
+	mov	r1,ep
+#else
+	ld.w	0[sp],r29
+	ld.w	4[sp],r28
+	ld.w	8[sp],r27
+	ld.w	12[sp],r31
+	addi	16,sp,sp
+#endif
+	jmp	[r31]
+	.size	__return_r27_r31,.-__return_r27_r31
+#endif /* L_save_27c */
+
+#ifdef	L_save_28c
+	.text
+	.align	2
+	.globl	__save_r28_r31
+	.type	__save_r28_r31,@function
+	/* Allocate space and save registers 28 .. 29, 31 on the stack.  */
+	/* Also allocate space for the argument save area.  */
+	/* Called via:	jalr __save_r28_r31,r10.  */
+__save_r28_r31:
+	addi	-12,sp,sp
+	st.w	r29,0[sp]
+	st.w	r28,4[sp]
+	st.w	r31,8[sp]
+	jmp	[r10]
+	.size	__save_r28_r31,.-__save_r28_r31
+
+	/* Restore saved registers, deallocate stack and return to the user.  */
+	/* Called via:	jr __return_r28_r31.  */
+	.align	2
+	.globl	__return_r28_r31
+	.type	__return_r28_r31,@function
+__return_r28_r31:
+	ld.w	0[sp],r29
+	ld.w	4[sp],r28
+	ld.w	8[sp],r31
+	addi	12,sp,sp
+	jmp	[r31]
+	.size	__return_r28_r31,.-__return_r28_r31
+#endif /* L_save_28c */
+
+#ifdef	L_save_29c
+	.text
+	.align	2
+	.globl	__save_r29_r31
+	.type	__save_r29_r31,@function
+	/* Allocate space and save registers 29 & 31 on the stack.  */
+	/* Also allocate space for the argument save area.  */
+	/* Called via:	jalr __save_r29_r31,r10.  */
+__save_r29_r31:
+	addi	-8,sp,sp
+	st.w	r29,0[sp]
+	st.w	r31,4[sp]
+	jmp	[r10]
+	.size	__save_r29_r31,.-__save_r29_r31
+
+	/* Restore saved registers, deallocate stack and return to the user.  */
+	/* Called via:	jr __return_r29_r31.  */
+	.align	2
+	.globl	__return_r29_r31
+	.type	__return_r29_r31,@function
+__return_r29_r31:
+	ld.w	0[sp],r29
+	ld.w	4[sp],r31
+	addi	8,sp,sp
+	jmp	[r31]
+	.size	__return_r29_r31,.-__return_r29_r31
+#endif /* L_save_29c */
+
+#ifdef	L_save_31c
+	.text
+	.align	2
+	.globl	__save_r31
+	.type	__save_r31,@function
+	/* Allocate space and save register 31 on the stack.  */
+	/* Also allocate space for the argument save area.  */
+	/* Called via:	jalr __save_r31,r10.  */
+__save_r31:
+	addi	-4,sp,sp
+	st.w	r31,0[sp]
+	jmp	[r10]
+	.size	__save_r31,.-__save_r31
+
+	/* Restore saved registers, deallocate stack and return to the user.  */
+	/* Called via:	jr __return_r31.  */
+	.align	2
+	.globl	__return_r31
+	.type	__return_r31,@function
+__return_r31:
+	ld.w	0[sp],r31
+	addi	4,sp,sp
+	jmp	[r31]
+        .size   __return_r31,.-__return_r31
+#endif /* L_save_31c */
+
+#ifdef	L_save_interrupt
+	.text
+	.align	2
+	.globl	__save_interrupt
+	.type	__save_interrupt,@function
+	/* Save registers r1, r4 on stack and load up with expected values.  */
+	/* Note, 20 bytes of stack have already been allocated.  */
+	/* Called via:	jalr __save_interrupt,r10.  */
+__save_interrupt:
+       /* add -20,sp ; st.w r11,16[sp] ; st.w r10,12[sp] ; */
+	st.w	ep,0[sp]
+	st.w	gp,4[sp]
+	st.w	r1,8[sp]
+	movhi	hi(__ep),r0,ep
+	movea	lo(__ep),ep,ep
+	movhi	hi(__gp),r0,gp
+	movea	lo(__gp),gp,gp
+	jmp	[r10]
+	.size	__save_interrupt,.-__save_interrupt
+
+	/* Restore saved registers, deallocate stack and return from the interrupt.  */
+	/* Called via:	jr __return_interrupt.  */
+	.align	2
+	.globl	__return_interrupt
+	.type	__return_interrupt,@function
+__return_interrupt:
+	ld.w	0[sp],ep
+	ld.w	4[sp],gp
+	ld.w	8[sp],r1
+	ld.w	12[sp],r10
+	ld.w    16[sp],r11
+	addi    20,sp,sp
+	reti
+	.size	__return_interrupt,.-__return_interrupt
+#endif /* L_save_interrupt */
+
+#ifdef L_save_all_interrupt
+	.text
+	.align	2
+	.globl	__save_all_interrupt
+	.type	__save_all_interrupt,@function
+	/* Save all registers except for those saved in __save_interrupt.  */
+	/* Allocate enough stack for all of the registers & 16 bytes of space.  */
+	/* Called via:	jalr __save_all_interrupt,r10.  */
+__save_all_interrupt:
+	addi	-104,sp,sp
+#ifdef __EP__
+	mov	ep,r1
+	mov	sp,ep
+	sst.w	r31,100[ep]
+	sst.w	r2,96[ep]
+	sst.w	gp,92[ep]
+	sst.w	r6,88[ep]
+	sst.w	r7,84[ep]
+	sst.w	r8,80[ep]
+	sst.w	r9,76[ep]
+	sst.w	r11,72[ep]
+	sst.w	r12,68[ep]
+	sst.w	r13,64[ep]
+	sst.w	r14,60[ep]
+	sst.w	r15,56[ep]
+	sst.w	r16,52[ep]
+	sst.w	r17,48[ep]
+	sst.w	r18,44[ep]
+	sst.w	r19,40[ep]
+	sst.w	r20,36[ep]
+	sst.w	r21,32[ep]
+	sst.w	r22,28[ep]
+	sst.w	r23,24[ep]
+	sst.w	r24,20[ep]
+	sst.w	r25,16[ep]
+	sst.w	r26,12[ep]
+	sst.w	r27,8[ep]
+	sst.w	r28,4[ep]
+	sst.w	r29,0[ep]
+	mov	r1,ep
+#else
+	st.w	r31,100[sp]
+	st.w	r2,96[sp]
+	st.w	gp,92[sp]
+	st.w	r6,88[sp]
+	st.w	r7,84[sp]
+	st.w	r8,80[sp]
+	st.w	r9,76[sp]
+	st.w	r11,72[sp]
+	st.w	r12,68[sp]
+	st.w	r13,64[sp]
+	st.w	r14,60[sp]
+	st.w	r15,56[sp]
+	st.w	r16,52[sp]
+	st.w	r17,48[sp]
+	st.w	r18,44[sp]
+	st.w	r19,40[sp]
+	st.w	r20,36[sp]
+	st.w	r21,32[sp]
+	st.w	r22,28[sp]
+	st.w	r23,24[sp]
+	st.w	r24,20[sp]
+	st.w	r25,16[sp]
+	st.w	r26,12[sp]
+	st.w	r27,8[sp]
+	st.w	r28,4[sp]
+	st.w	r29,0[sp]
+#endif
+	jmp	[r10]
+	.size	__save_all_interrupt,.-__save_all_interrupt
+
+	.globl	__restore_all_interrupt
+	.type	__restore_all_interrupt,@function
+	/* Restore all registers saved in __save_all_interrupt and
+	   deallocate the stack space.  */
+	/* Called via:	jalr __restore_all_interrupt,r10.  */
+__restore_all_interrupt:
+#ifdef __EP__
+	mov	ep,r1
+	mov	sp,ep
+	sld.w	100[ep],r31
+	sld.w	96[ep],r2
+	sld.w	92[ep],gp
+	sld.w	88[ep],r6
+	sld.w	84[ep],r7
+	sld.w	80[ep],r8
+	sld.w	76[ep],r9
+	sld.w	72[ep],r11
+	sld.w	68[ep],r12
+	sld.w	64[ep],r13
+	sld.w	60[ep],r14
+	sld.w	56[ep],r15
+	sld.w	52[ep],r16
+	sld.w	48[ep],r17
+	sld.w	44[ep],r18
+	sld.w	40[ep],r19
+	sld.w	36[ep],r20
+	sld.w	32[ep],r21
+	sld.w	28[ep],r22
+	sld.w	24[ep],r23
+	sld.w	20[ep],r24
+	sld.w	16[ep],r25
+	sld.w	12[ep],r26
+	sld.w	8[ep],r27
+	sld.w	4[ep],r28
+	sld.w	0[ep],r29
+	mov	r1,ep
+#else
+	ld.w	100[sp],r31
+	ld.w	96[sp],r2
+	ld.w	92[sp],gp
+	ld.w	88[sp],r6
+	ld.w	84[sp],r7
+	ld.w	80[sp],r8
+	ld.w	76[sp],r9
+	ld.w	72[sp],r11
+	ld.w	68[sp],r12
+	ld.w	64[sp],r13
+	ld.w	60[sp],r14
+	ld.w	56[sp],r15
+	ld.w	52[sp],r16
+	ld.w	48[sp],r17
+	ld.w	44[sp],r18
+	ld.w	40[sp],r19
+	ld.w	36[sp],r20
+	ld.w	32[sp],r21
+	ld.w	28[sp],r22
+	ld.w	24[sp],r23
+	ld.w	20[sp],r24
+	ld.w	16[sp],r25
+	ld.w	12[sp],r26
+	ld.w	8[sp],r27
+	ld.w	4[sp],r28
+	ld.w	0[sp],r29
+#endif
+	addi	104,sp,sp	
+	jmp	[r10]
+	.size	__restore_all_interrupt,.-__restore_all_interrupt
+#endif /* L_save_all_interrupt */
+	
+#if defined(__v850e__) || defined(__v850e1__) || defined(__v850e2__) || defined(__v850e2v3__)
+#ifdef	L_callt_save_r2_r29
+	/* Put these functions into the call table area.  */
+	.call_table_text
+	
+	/* Allocate space and save registers 2, 20 .. 29 on the stack.  */
+	/* Called via:	callt ctoff(__callt_save_r2_r29).  */
+	.align	2
+.L_save_r2_r29:
+	add	-4, sp
+	st.w	r2, 0[sp]
+	prepare {r20 - r29}, 0
+	ctret
+
+	/* Restore saved registers, deallocate stack and return to the user.  */
+	/* Called via:	callt ctoff(__callt_return_r2_r29).  */
+	.align	2
+.L_return_r2_r29:
+	dispose 0, {r20-r29}
+	ld.w    0[sp], r2
+	add	4, sp
+	jmp     [r31]
+
+	/* Place the offsets of the start of these routines into the call table.  */
+	.call_table_data
+
+	.global	__callt_save_r2_r29
+	.type	__callt_save_r2_r29,@function
+__callt_save_r2_r29:	.short ctoff(.L_save_r2_r29)
+	
+	.global	__callt_return_r2_r29
+	.type	__callt_return_r2_r29,@function
+__callt_return_r2_r29:	.short ctoff(.L_return_r2_r29)
+	
+#endif /* L_callt_save_r2_r29.  */
+
+#ifdef	L_callt_save_r2_r31
+	/* Put these functions into the call table area.  */
+	.call_table_text
+	
+	/* Allocate space and save registers 2 and 20 .. 29, 31 on the stack.  */
+	/* Also allocate space for the argument save area.  */
+	/* Called via:	callt ctoff(__callt_save_r2_r31).  */
+	.align	2
+.L_save_r2_r31:
+	add	-4, sp
+	st.w	r2, 0[sp]
+	prepare {r20 - r29, r31}, 0
+	ctret
+
+	/* Restore saved registers, deallocate stack and return to the user.  */
+	/* Called via:	callt ctoff(__callt_return_r2_r31).  */
+	.align	2
+.L_return_r2_r31:
+	dispose 0, {r20 - r29, r31}
+	ld.w    0[sp], r2
+	addi	4, sp, sp
+	jmp     [r31]
+
+	/* Place the offsets of the start of these routines into the call table.  */
+	.call_table_data
+
+	.global	__callt_save_r2_r31
+	.type	__callt_save_r2_r31,@function
+__callt_save_r2_r31:	.short ctoff(.L_save_r2_r31)
+	
+	.global	__callt_return_r2_r31
+	.type	__callt_return_r2_r31,@function
+__callt_return_r2_r31:	.short ctoff(.L_return_r2_r31)
+	
+#endif /* L_callt_save_r2_r31 */
+
+#ifdef	L_callt_save_interrupt
+	/* Put these functions into the call table area.  */
+	.call_table_text
+	
+	/* Save registers r1, ep, gp, r10 on stack and load up with expected values.  */
+	/* Called via:	callt ctoff(__callt_save_interrupt).  */
+	.align	2
+.L_save_interrupt:
+        /* SP has already been moved before callt ctoff(_save_interrupt).  */
+        /* R1,R10,R11,ctpc,ctpsw has alread been saved bofore callt ctoff(_save_interrupt).  */
+        /* addi -28, sp, sp  */
+        /* st.w r1,    24[sp] */
+        /* st.w r10,   12[sp] */
+        /* st.w r11,   16[sp] */
+        /* stsr ctpc,  r10    */
+        /* st.w r10,   20[sp] */
+        /* stsr ctpsw, r10    */
+        /* st.w r10,   24[sp] */
+        st.w    ep,  0[sp]
+        st.w    gp,  4[sp]
+        st.w    r1,  8[sp]
+	mov	hilo(__ep),ep
+	mov	hilo(__gp),gp
+	ctret
+
+        .call_table_text
+	/* Restore saved registers, deallocate stack and return from the interrupt.  */
+        /* Called via:  callt ctoff(__callt_restore_interrupt).  */
+	.align	2
+	.globl	__return_interrupt
+	.type	__return_interrupt,@function
+.L_return_interrupt:
+        ld.w    24[sp], r1
+        ldsr    r1,     ctpsw
+        ld.w    20[sp], r1
+        ldsr    r1,     ctpc
+        ld.w    16[sp], r11
+        ld.w    12[sp], r10
+        ld.w     8[sp], r1
+        ld.w     4[sp], gp
+        ld.w     0[sp], ep
+        addi    28, sp, sp
+        reti
+
+	/* Place the offsets of the start of these routines into the call table.  */
+	.call_table_data
+
+        .global __callt_save_interrupt
+        .type   __callt_save_interrupt,@function
+__callt_save_interrupt:         .short ctoff(.L_save_interrupt)
+
+        .global __callt_return_interrupt
+        .type   __callt_return_interrupt,@function
+__callt_return_interrupt:       .short ctoff(.L_return_interrupt)
+	
+#endif /* L_callt_save_interrupt */
+
+#ifdef L_callt_save_all_interrupt
+	/* Put these functions into the call table area.  */
+	.call_table_text
+	
+	/* Save all registers except for those saved in __save_interrupt.  */
+	/* Allocate enough stack for all of the registers & 16 bytes of space.  */
+	/* Called via:	callt ctoff(__callt_save_all_interrupt).  */
+	.align	2
+.L_save_all_interrupt:
+	addi	-60, sp, sp
+#ifdef __EP__
+	mov	ep,  r1
+	mov	sp,  ep
+	sst.w	r2,  56[ep]
+	sst.w	r5,  52[ep]
+	sst.w	r6,  48[ep]
+	sst.w	r7,  44[ep]
+	sst.w	r8,  40[ep]
+	sst.w	r9,  36[ep]
+	sst.w	r11, 32[ep]
+	sst.w	r12, 28[ep]
+	sst.w	r13, 24[ep]
+	sst.w	r14, 20[ep]
+	sst.w	r15, 16[ep]
+	sst.w	r16, 12[ep]
+	sst.w	r17, 8[ep]
+	sst.w	r18, 4[ep]
+	sst.w	r19, 0[ep]
+	mov	r1,  ep
+#else
+	st.w	r2,  56[sp]
+	st.w	r5,  52[sp]
+	st.w	r6,  48[sp]
+	st.w	r7,  44[sp]
+	st.w	r8,  40[sp]
+	st.w	r9,  36[sp]
+	st.w	r11, 32[sp]
+	st.w	r12, 28[sp]
+	st.w	r13, 24[sp]
+	st.w	r14, 20[sp]
+	st.w	r15, 16[sp]
+	st.w	r16, 12[sp]
+	st.w	r17, 8[sp]
+	st.w	r18, 4[sp]
+	st.w	r19, 0[sp]
+#endif
+	prepare {r20 - r29, r31}, 0
+	ctret	
+
+	/* Restore all registers saved in __save_all_interrupt
+	   deallocate the stack space.  */
+	/* Called via:	callt ctoff(__callt_restore_all_interrupt).  */
+	.align 2
+.L_restore_all_interrupt:
+	dispose 0, {r20 - r29, r31}
+#ifdef __EP__
+	mov	ep, r1
+	mov	sp, ep
+	sld.w	0 [ep], r19
+	sld.w	4 [ep], r18
+	sld.w	8 [ep], r17
+	sld.w	12[ep], r16
+	sld.w	16[ep], r15
+	sld.w	20[ep], r14
+	sld.w	24[ep], r13
+	sld.w	28[ep], r12
+	sld.w	32[ep], r11
+	sld.w	36[ep], r9
+	sld.w	40[ep], r8
+	sld.w	44[ep], r7
+	sld.w	48[ep], r6
+	sld.w	52[ep], r5
+	sld.w	56[ep], r2
+	mov	r1, ep
+#else
+	ld.w	0 [sp], r19
+	ld.w	4 [sp], r18
+	ld.w	8 [sp], r17
+	ld.w	12[sp], r16
+	ld.w	16[sp], r15
+	ld.w	20[sp], r14
+	ld.w	24[sp], r13
+	ld.w	28[sp], r12
+	ld.w	32[sp], r11
+	ld.w	36[sp], r9
+	ld.w	40[sp], r8
+	ld.w	44[sp], r7
+	ld.w	48[sp], r6
+	ld.w	52[sp], r5
+	ld.w	56[sp], r2
+#endif
+	addi	60, sp, sp
+	ctret
+
+	/* Place the offsets of the start of these routines into the call table.  */
+	.call_table_data
+
+	.global	__callt_save_all_interrupt
+	.type	__callt_save_all_interrupt,@function
+__callt_save_all_interrupt:	.short ctoff(.L_save_all_interrupt)
+	
+	.global	__callt_restore_all_interrupt
+	.type	__callt_restore_all_interrupt,@function
+__callt_restore_all_interrupt:	.short ctoff(.L_restore_all_interrupt)
+	
+#endif /* L_callt_save_all_interrupt */
+
+
+#define MAKE_CALLT_FUNCS( START )						\
+	.call_table_text							;\
+	.align	2								;\
+	/* Allocate space and save registers START .. r29 on the stack.  */	;\
+	/* Called via:	callt ctoff(__callt_save_START_r29).  */		;\
+.L_save_##START##_r29:								;\
+	prepare { START - r29 }, 0						;\
+	ctret									;\
+										;\
+	/* Restore saved registers, deallocate stack and return.  */		;\
+	/* Called via:	callt ctoff(__return_START_r29).  */			;\
+	.align	2								;\
+.L_return_##START##_r29:							;\
+	dispose 0, { START - r29 }, r31						;\
+										;\
+	/* Place the offsets of the start of these funcs into the call table.  */;\
+	.call_table_data							;\
+										;\
+	.global	__callt_save_##START##_r29					;\
+	.type	__callt_save_##START##_r29,@function				;\
+__callt_save_##START##_r29:	.short ctoff(.L_save_##START##_r29 )		;\
+										;\
+	.global	__callt_return_##START##_r29					;\
+	.type	__callt_return_##START##_r29,@function				;\
+__callt_return_##START##_r29:	.short ctoff(.L_return_##START##_r29 )	
+
+
+#define MAKE_CALLT_CFUNCS( START )						\
+	.call_table_text							;\
+	.align	2								;\
+	/* Allocate space and save registers START .. r31 on the stack.  */	;\
+	/* Called via:	callt ctoff(__callt_save_START_r31c).  */		;\
+.L_save_##START##_r31c:								;\
+	prepare { START - r29, r31}, 0						;\
+	ctret									;\
+										;\
+	/* Restore saved registers, deallocate stack and return.  */		;\
+	/* Called via:	callt ctoff(__return_START_r31c).  */			;\
+	.align	2								;\
+.L_return_##START##_r31c:							;\
+	dispose 0, { START - r29, r31}, r31					;\
+										;\
+	/* Place the offsets of the start of these funcs into the call table.  */;\
+	.call_table_data							;\
+										;\
+	.global	__callt_save_##START##_r31c					;\
+	.type	__callt_save_##START##_r31c,@function				;\
+__callt_save_##START##_r31c:    .short ctoff(.L_save_##START##_r31c )		;\
+										;\
+	.global	__callt_return_##START##_r31c					;\
+	.type	__callt_return_##START##_r31c,@function				;\
+__callt_return_##START##_r31c:  .short ctoff(.L_return_##START##_r31c )	
+
+	
+#ifdef	L_callt_save_20
+	MAKE_CALLT_FUNCS (r20)
+#endif
+#ifdef	L_callt_save_21
+	MAKE_CALLT_FUNCS (r21)
+#endif
+#ifdef	L_callt_save_22
+	MAKE_CALLT_FUNCS (r22)
+#endif
+#ifdef	L_callt_save_23
+	MAKE_CALLT_FUNCS (r23)
+#endif
+#ifdef	L_callt_save_24
+	MAKE_CALLT_FUNCS (r24)
+#endif
+#ifdef	L_callt_save_25
+	MAKE_CALLT_FUNCS (r25)
+#endif
+#ifdef	L_callt_save_26
+	MAKE_CALLT_FUNCS (r26)
+#endif
+#ifdef	L_callt_save_27
+	MAKE_CALLT_FUNCS (r27)
+#endif
+#ifdef	L_callt_save_28
+	MAKE_CALLT_FUNCS (r28)
+#endif
+#ifdef	L_callt_save_29
+	MAKE_CALLT_FUNCS (r29)
+#endif
+
+#ifdef	L_callt_save_20c
+	MAKE_CALLT_CFUNCS (r20)
+#endif
+#ifdef	L_callt_save_21c
+	MAKE_CALLT_CFUNCS (r21)
+#endif
+#ifdef	L_callt_save_22c
+	MAKE_CALLT_CFUNCS (r22)
+#endif
+#ifdef	L_callt_save_23c
+	MAKE_CALLT_CFUNCS (r23)
+#endif
+#ifdef	L_callt_save_24c
+	MAKE_CALLT_CFUNCS (r24)
+#endif
+#ifdef	L_callt_save_25c
+	MAKE_CALLT_CFUNCS (r25)
+#endif
+#ifdef	L_callt_save_26c
+	MAKE_CALLT_CFUNCS (r26)
+#endif
+#ifdef	L_callt_save_27c
+	MAKE_CALLT_CFUNCS (r27)
+#endif
+#ifdef	L_callt_save_28c
+	MAKE_CALLT_CFUNCS (r28)
+#endif
+#ifdef	L_callt_save_29c
+	MAKE_CALLT_CFUNCS (r29)
+#endif
+
+	
+#ifdef	L_callt_save_31c
+	.call_table_text
+	.align	2
+	/* Allocate space and save register r31 on the stack.  */
+	/* Called via:	callt ctoff(__callt_save_r31c).  */
+.L_callt_save_r31c:
+	prepare {r31}, 0
+	ctret
+
+	/* Restore saved registers, deallocate stack and return.  */
+	/* Called via:	callt ctoff(__return_r31c).  */
+	.align	2
+.L_callt_return_r31c:
+	dispose 0, {r31}, r31
+	
+	/* Place the offsets of the start of these funcs into the call table.  */
+	.call_table_data
+
+	.global	__callt_save_r31c
+	.type	__callt_save_r31c,@function
+__callt_save_r31c:	.short ctoff(.L_callt_save_r31c)
+
+	.global	__callt_return_r31c
+	.type	__callt_return_r31c,@function
+__callt_return_r31c:	.short ctoff(.L_callt_return_r31c)		
+#endif
+
+#endif /* __v850e__ */
+
+/*  libgcc2 routines for NEC V850.  */
+/*  Double Integer Arithmetical Operation.  */
+
+#ifdef L_negdi2
+	.text
+	.global ___negdi2
+	.type   ___negdi2, @function
+___negdi2:
+	not	r6, r10
+	add	1,  r10
+	setf	l,  r6
+	not	r7, r11
+	add	r6, r11
+	jmp	[lp]
+
+	.size ___negdi2,.-___negdi2
+#endif
+
+#ifdef L_cmpdi2
+	.text
+	.global ___cmpdi2
+	.type	___cmpdi2,@function
+___cmpdi2:
+	# Signed comparison bitween each high word.
+	cmp	r9, r7
+	be	.L_cmpdi_cmp_low
+	setf	ge, r10
+	setf	gt, r6
+	add	r6, r10
+	jmp	[lp]
+.L_cmpdi_cmp_low:
+	# Unsigned comparigon bitween each low word.
+	cmp     r8, r6
+	setf	nl, r10
+	setf	h,  r6
+	add	r6, r10
+	jmp	[lp]	
+	.size ___cmpdi2, . - ___cmpdi2	
+#endif
+
+#ifdef L_ucmpdi2
+	.text
+	.global ___ucmpdi2
+	.type	___ucmpdi2,@function
+___ucmpdi2:
+	cmp	r9, r7  # Check if each high word are same.
+	bne	.L_ucmpdi_check_psw
+	cmp     r8, r6  # Compare the word.
+.L_ucmpdi_check_psw:
+	setf	nl, r10 # 
+	setf	h,  r6  # 
+	add	r6, r10 # Add the result of comparison NL and comparison H.
+	jmp	[lp]	
+	.size ___ucmpdi2, . - ___ucmpdi2
+#endif
+
+#ifdef L_muldi3
+	.text
+	.global ___muldi3
+	.type	___muldi3,@function
+___muldi3:
+#ifdef __v850__
+        jarl  __save_r26_r31, r10
+        addi  16,  sp, sp
+        mov   r6,  r28
+        shr   15,  r28
+        movea lo(32767), r0, r14
+        and   r14, r28
+        mov   r8,  r10
+        shr   15,  r10
+        and   r14, r10
+        mov   r6,  r19
+        shr   30,  r19
+        mov   r7,  r12
+        shl   2,   r12
+        or    r12, r19
+        and   r14, r19
+        mov   r8,  r13
+        shr   30,  r13
+        mov   r9,  r12
+        shl   2,   r12
+        or    r12, r13
+        and   r14, r13
+        mov   r7,  r11
+        shr   13,  r11
+        and   r14, r11
+        mov   r9,  r31
+        shr   13,  r31
+        and   r14, r31
+        mov   r7,  r29
+        shr   28,  r29
+        and   r14, r29
+        mov   r9,  r12
+        shr   28,  r12
+        and   r14, r12
+        and   r14, r6
+        and   r14, r8
+        mov   r6,  r14
+        mulh  r8,  r14
+        mov   r6,  r16
+        mulh  r10, r16
+        mov   r6,  r18
+        mulh  r13, r18
+        mov   r6,  r15
+        mulh  r31, r15
+        mulh  r12, r6
+        mov   r28,  r17
+        mulh  r10, r17
+        add   -16, sp
+        mov   r28,  r12
+        mulh  r8,  r12
+        add   r17, r18
+        mov   r28,  r17
+        mulh  r31, r17
+        add   r12, r16
+        mov   r28,  r12
+        mulh  r13, r12
+        add   r17, r6
+        mov   r19, r17
+        add   r12, r15
+        mov   r19, r12
+        mulh  r8,  r12
+        mulh  r10, r17
+        add   r12, r18
+        mov   r19, r12
+        mulh  r13, r12
+        add   r17, r15
+        mov   r11, r13
+        mulh  r8,  r13
+        add   r12, r6
+        mov   r11, r12
+        mulh  r10, r12
+        add   r13, r15
+        mulh  r29, r8
+        add   r12, r6
+        mov   r16, r13
+        shl   15,  r13
+        add   r14, r13
+        mov   r18, r12
+        shl   30,  r12
+        mov   r13, r26
+        add   r12, r26
+        shr   15,  r14
+        movhi hi(131071), r0,  r12
+        movea lo(131071), r12, r13
+        and   r13, r14
+        mov   r16, r12
+        and   r13, r12
+        add   r12, r14
+        mov   r18, r12
+        shl   15,  r12
+        and   r13, r12
+        add   r12, r14
+        shr   17,  r14
+        shr   17,  r16
+        add   r14, r16
+        shl   13,  r15
+        shr   2,   r18
+        add   r18, r15
+        add   r15, r16
+        mov   r16, r27
+        add   r8,  r6
+        shl   28,  r6
+        add   r6,  r27
+        mov   r26, r10
+        mov   r27, r11
+        jr    __return_r26_r31
+#else /* defined(__v850e__) */
+	/*  (Ahi << 32 + Alo) * (Bhi << 32 + Blo) */
+	/*   r7           r6      r9         r8   */
+	mov  r8, r10
+	mulu r7, r8,  r0		/* Ahi * Blo */
+	mulu r6, r9,  r0		/* Alo * Bhi */
+	mulu r6, r10, r11		/* Alo * Blo */
+	add  r8, r11
+	add  r9, r11
+	jmp  [r31]
+#endif /* defined(__v850e__) */
+	.size ___muldi3, . - ___muldi3
+#endif
+	
diff --git a/libgcc/config/v850/t-v850 b/libgcc/config/v850/t-v850
new file mode 100644
index 00000000000..b61703ace09
--- /dev/null
+++ b/libgcc/config/v850/t-v850
@@ -0,0 +1,60 @@
+LIB1ASMSRC = v850/lib1funcs.S
+LIB1ASMFUNCS	= _mulsi3 \
+		  _divsi3 \
+		  _udivsi3 \
+		  _modsi3 \
+		  _umodsi3 \
+		  _save_2 \
+		  _save_20 \
+		  _save_21 \
+		  _save_22 \
+		  _save_23 \
+		  _save_24 \
+		  _save_25 \
+		  _save_26 \
+		  _save_27 \
+		  _save_28 \
+		  _save_29 \
+		  _save_2c \
+		  _save_20c \
+		  _save_21c \
+		  _save_22c \
+		  _save_23c \
+		  _save_24c \
+		  _save_25c \
+		  _save_26c \
+		  _save_27c \
+		  _save_28c \
+		  _save_29c \
+		  _save_31c \
+		  _save_interrupt \
+		  _save_all_interrupt \
+                  _callt_save_20 \
+		  _callt_save_21 \
+		  _callt_save_22 \
+		  _callt_save_23 \
+		  _callt_save_24 \
+		  _callt_save_25 \
+		  _callt_save_26 \
+		  _callt_save_27 \
+		  _callt_save_28 \
+		  _callt_save_29 \
+		  _callt_save_20c \
+		  _callt_save_21c \
+		  _callt_save_22c \
+		  _callt_save_23c \
+		  _callt_save_24c \
+		  _callt_save_25c \
+		  _callt_save_26c \
+		  _callt_save_27c \
+		  _callt_save_28c \
+		  _callt_save_29c \
+		  _callt_save_31c \
+		  _callt_save_interrupt \
+		  _callt_save_all_interrupt \
+		  _callt_save_r2_r29 \
+		  _callt_save_r2_r31 \
+		  _negdi2 \
+		  _cmpdi2 \
+		  _ucmpdi2 \
+		  _muldi3
diff --git a/libgcc/config/vax/lib1funcs.S b/libgcc/config/vax/lib1funcs.S
new file mode 100644
index 00000000000..1d57b56dad9
--- /dev/null
+++ b/libgcc/config/vax/lib1funcs.S
@@ -0,0 +1,92 @@
+/* Copyright (C) 2009 Free Software Foundation, Inc.
+   This file is part of GCC.
+   Contributed by Maciej W. Rozycki <macro@linux-mips.org>.
+
+   This file is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 3, or (at your option) any
+   later version.
+
+   This file is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef L_udivsi3
+	.text
+	.globl	__udivsi3
+	.type	__udivsi3, @function
+__udivsi3:
+	.word	0
+	movl	8(%ap), %r1
+	blss	0f			/* Check bit #31 of divisor.  */
+	movl	4(%ap), %r2
+	blss	1f			/* Check bit #31 of dividend.  */
+
+	/* Both zero, do a standard division.  */
+
+	divl3	%r1, %r2, %r0
+	ret
+
+	/* MSB of divisor set, only 1 or 0 may result.  */
+0:
+	decl	%r1
+	clrl	%r0
+	cmpl	%r1, 4(%ap)
+	adwc	$0, %r0
+	ret
+
+	/* MSB of dividend set, do an extended division.  */
+1:
+	clrl	%r3
+	ediv	%r1, %r2, %r0, %r3
+	ret
+	.size	__udivsi3, . - __udivsi3
+	.previous
+#endif
+
+#ifdef L_umodsi3
+	.text
+	.globl	__umodsi3
+	.type	__umodsi3, @function
+__umodsi3:
+	.word	0
+	movl	8(%ap), %r1
+	blss	0f			/* Check bit #31 of divisor.  */
+	movl	4(%ap), %r2
+	blss	1f			/* Check bit #31 of dividend.  */
+
+	/* Both zero, do a standard division.  */
+
+	divl3	%r1, %r2, %r0
+	mull2	%r0, %r1
+	subl3	%r1, %r2, %r0
+	ret
+
+	/* MSB of divisor set, subtract the divisor at most once.  */
+0:
+	movl	4(%ap), %r2
+	clrl	%r0
+	cmpl	%r2, %r1
+	sbwc	$0, %r0
+	bicl2	%r0, %r1
+	subl3	%r1, %r2, %r0
+	ret
+
+	/* MSB of dividend set, do an extended division.  */
+1:
+	clrl	%r3
+	ediv	%r1, %r2, %r3, %r0
+	ret
+	.size	__umodsi3, . - __umodsi3
+	.previous
+#endif
diff --git a/libgcc/config/vax/t-linux b/libgcc/config/vax/t-linux
new file mode 100644
index 00000000000..17929c8717c
--- /dev/null
+++ b/libgcc/config/vax/t-linux
@@ -0,0 +1,2 @@
+LIB1ASMSRC = vax/lib1funcs.S
+LIB1ASMFUNCS = _udivsi3 _umodsi3
diff --git a/libgcc/config/xtensa/ieee754-df.S b/libgcc/config/xtensa/ieee754-df.S
new file mode 100644
index 00000000000..9b46889bdc2
--- /dev/null
+++ b/libgcc/config/xtensa/ieee754-df.S
@@ -0,0 +1,2388 @@
+/* IEEE-754 double-precision functions for Xtensa
+   Copyright (C) 2006, 2007, 2009 Free Software Foundation, Inc.
+   Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef __XTENSA_EB__
+#define xh a2
+#define xl a3
+#define yh a4
+#define yl a5
+#else
+#define xh a3
+#define xl a2
+#define yh a5
+#define yl a4
+#endif
+
+/*  Warning!  The branch displacements for some Xtensa branch instructions
+    are quite small, and this code has been carefully laid out to keep
+    branch targets in range.  If you change anything, be sure to check that
+    the assembler is not relaxing anything to branch over a jump.  */
+
+#ifdef L_negdf2
+
+	.align	4
+	.global	__negdf2
+	.type	__negdf2, @function
+__negdf2:
+	leaf_entry sp, 16
+	movi	a4, 0x80000000
+	xor	xh, xh, a4
+	leaf_return
+
+#endif /* L_negdf2 */
+
+#ifdef L_addsubdf3
+
+	/* Addition */
+__adddf3_aux:
+	
+	/* Handle NaNs and Infinities.  (This code is placed before the
+	   start of the function just to keep it in range of the limited
+	   branch displacements.)  */
+
+.Ladd_xnan_or_inf:
+	/* If y is neither Infinity nor NaN, return x.  */
+	bnall	yh, a6, 1f
+	/* If x is a NaN, return it.  Otherwise, return y.  */
+	slli	a7, xh, 12
+	or	a7, a7, xl
+	beqz	a7, .Ladd_ynan_or_inf
+1:	leaf_return
+
+.Ladd_ynan_or_inf:
+	/* Return y.  */
+	mov	xh, yh
+	mov	xl, yl
+	leaf_return
+
+.Ladd_opposite_signs:
+	/* Operand signs differ.  Do a subtraction.  */
+	slli	a7, a6, 11
+	xor	yh, yh, a7
+	j	.Lsub_same_sign
+
+	.align	4
+	.global	__adddf3
+	.type	__adddf3, @function
+__adddf3:
+	leaf_entry sp, 16
+	movi	a6, 0x7ff00000
+
+	/* Check if the two operands have the same sign.  */
+	xor	a7, xh, yh
+	bltz	a7, .Ladd_opposite_signs
+
+.Ladd_same_sign:	
+	/* Check if either exponent == 0x7ff (i.e., NaN or Infinity).  */
+	ball	xh, a6, .Ladd_xnan_or_inf
+	ball	yh, a6, .Ladd_ynan_or_inf
+
+	/* Compare the exponents.  The smaller operand will be shifted
+	   right by the exponent difference and added to the larger
+	   one.  */
+	extui	a7, xh, 20, 12
+	extui	a8, yh, 20, 12
+	bltu	a7, a8, .Ladd_shiftx
+
+.Ladd_shifty:
+	/* Check if the smaller (or equal) exponent is zero.  */
+	bnone	yh, a6, .Ladd_yexpzero
+
+	/* Replace yh sign/exponent with 0x001.  */
+	or	yh, yh, a6
+	slli	yh, yh, 11
+	srli	yh, yh, 11
+
+.Ladd_yexpdiff:
+	/* Compute the exponent difference.  Optimize for difference < 32.  */
+	sub	a10, a7, a8
+	bgeui	a10, 32, .Ladd_bigshifty
+	
+	/* Shift yh/yl right by the exponent difference.  Any bits that are
+	   shifted out of yl are saved in a9 for rounding the result.  */
+	ssr	a10
+	movi	a9, 0
+	src	a9, yl, a9
+	src	yl, yh, yl
+	srl	yh, yh
+
+.Ladd_addy:
+	/* Do the 64-bit addition.  */
+	add	xl, xl, yl
+	add	xh, xh, yh
+	bgeu	xl, yl, 1f
+	addi	xh, xh, 1
+1:
+	/* Check if the add overflowed into the exponent.  */
+	extui	a10, xh, 20, 12
+	beq	a10, a7, .Ladd_round
+	mov	a8, a7
+	j	.Ladd_carry
+
+.Ladd_yexpzero:
+	/* y is a subnormal value.  Replace its sign/exponent with zero,
+	   i.e., no implicit "1.0", and increment the apparent exponent
+	   because subnormals behave as if they had the minimum (nonzero)
+	   exponent.  Test for the case when both exponents are zero.  */
+	slli	yh, yh, 12
+	srli	yh, yh, 12
+	bnone	xh, a6, .Ladd_bothexpzero
+	addi	a8, a8, 1
+	j	.Ladd_yexpdiff
+
+.Ladd_bothexpzero:
+	/* Both exponents are zero.  Handle this as a special case.  There
+	   is no need to shift or round, and the normal code for handling
+	   a carry into the exponent field will not work because it
+	   assumes there is an implicit "1.0" that needs to be added.  */
+	add	xl, xl, yl
+	add	xh, xh, yh
+	bgeu	xl, yl, 1f
+	addi	xh, xh, 1
+1:	leaf_return
+
+.Ladd_bigshifty:
+	/* Exponent difference > 64 -- just return the bigger value.  */
+	bgeui	a10, 64, 1b
+
+	/* Shift yh/yl right by the exponent difference.  Any bits that are
+	   shifted out are saved in a9 for rounding the result.  */
+	ssr	a10
+	sll	a11, yl		/* lost bits shifted out of yl */
+	src	a9, yh, yl
+	srl	yl, yh
+	movi	yh, 0
+	beqz	a11, .Ladd_addy
+	or	a9, a9, a10	/* any positive, nonzero value will work */
+	j	.Ladd_addy
+
+.Ladd_xexpzero:
+	/* Same as "yexpzero" except skip handling the case when both
+	   exponents are zero.  */
+	slli	xh, xh, 12
+	srli	xh, xh, 12
+	addi	a7, a7, 1
+	j	.Ladd_xexpdiff
+
+.Ladd_shiftx:
+	/* Same thing as the "shifty" code, but with x and y swapped.  Also,
+	   because the exponent difference is always nonzero in this version,
+	   the shift sequence can use SLL and skip loading a constant zero.  */
+	bnone	xh, a6, .Ladd_xexpzero
+
+	or	xh, xh, a6
+	slli	xh, xh, 11
+	srli	xh, xh, 11
+
+.Ladd_xexpdiff:
+	sub	a10, a8, a7
+	bgeui	a10, 32, .Ladd_bigshiftx
+	
+	ssr	a10
+	sll	a9, xl
+	src	xl, xh, xl
+	srl	xh, xh
+
+.Ladd_addx:
+	add	xl, xl, yl
+	add	xh, xh, yh
+	bgeu	xl, yl, 1f
+	addi	xh, xh, 1
+1:
+	/* Check if the add overflowed into the exponent.  */
+	extui	a10, xh, 20, 12
+	bne	a10, a8, .Ladd_carry
+
+.Ladd_round:
+	/* Round up if the leftover fraction is >= 1/2.  */
+	bgez	a9, 1f
+	addi	xl, xl, 1
+	beqz	xl, .Ladd_roundcarry
+
+	/* Check if the leftover fraction is exactly 1/2.  */
+	slli	a9, a9, 1
+	beqz	a9, .Ladd_exactlyhalf
+1:	leaf_return
+
+.Ladd_bigshiftx:
+	/* Mostly the same thing as "bigshifty"....  */
+	bgeui	a10, 64, .Ladd_returny
+
+	ssr	a10
+	sll	a11, xl
+	src	a9, xh, xl
+	srl	xl, xh
+	movi	xh, 0
+	beqz	a11, .Ladd_addx
+	or	a9, a9, a10
+	j	.Ladd_addx
+
+.Ladd_returny:
+	mov	xh, yh
+	mov	xl, yl
+	leaf_return
+
+.Ladd_carry:	
+	/* The addition has overflowed into the exponent field, so the
+	   value needs to be renormalized.  The mantissa of the result
+	   can be recovered by subtracting the original exponent and
+	   adding 0x100000 (which is the explicit "1.0" for the
+	   mantissa of the non-shifted operand -- the "1.0" for the
+	   shifted operand was already added).  The mantissa can then
+	   be shifted right by one bit.  The explicit "1.0" of the
+	   shifted mantissa then needs to be replaced by the exponent,
+	   incremented by one to account for the normalizing shift.
+	   It is faster to combine these operations: do the shift first
+	   and combine the additions and subtractions.  If x is the
+	   original exponent, the result is:
+	       shifted mantissa - (x << 19) + (1 << 19) + (x << 20)
+	   or:
+	       shifted mantissa + ((x + 1) << 19)
+	   Note that the exponent is incremented here by leaving the
+	   explicit "1.0" of the mantissa in the exponent field.  */
+
+	/* Shift xh/xl right by one bit.  Save the lsb of xl.  */
+	mov	a10, xl
+	ssai	1
+	src	xl, xh, xl
+	srl	xh, xh
+
+	/* See explanation above.  The original exponent is in a8.  */
+	addi	a8, a8, 1
+	slli	a8, a8, 19
+	add	xh, xh, a8
+
+	/* Return an Infinity if the exponent overflowed.  */
+	ball	xh, a6, .Ladd_infinity
+	
+	/* Same thing as the "round" code except the msb of the leftover
+	   fraction is bit 0 of a10, with the rest of the fraction in a9.  */
+	bbci.l	a10, 0, 1f
+	addi	xl, xl, 1
+	beqz	xl, .Ladd_roundcarry
+	beqz	a9, .Ladd_exactlyhalf
+1:	leaf_return
+
+.Ladd_infinity:
+	/* Clear the mantissa.  */
+	movi	xl, 0
+	srli	xh, xh, 20
+	slli	xh, xh, 20
+
+	/* The sign bit may have been lost in a carry-out.  Put it back.  */
+	slli	a8, a8, 1
+	or	xh, xh, a8
+	leaf_return
+
+.Ladd_exactlyhalf:
+	/* Round down to the nearest even value.  */
+	srli	xl, xl, 1
+	slli	xl, xl, 1
+	leaf_return
+
+.Ladd_roundcarry:
+	/* xl is always zero when the rounding increment overflows, so
+	   there's no need to round it to an even value.  */
+	addi	xh, xh, 1
+	/* Overflow to the exponent is OK.  */
+	leaf_return
+
+
+	/* Subtraction */
+__subdf3_aux:
+	
+	/* Handle NaNs and Infinities.  (This code is placed before the
+	   start of the function just to keep it in range of the limited
+	   branch displacements.)  */
+
+.Lsub_xnan_or_inf:
+	/* If y is neither Infinity nor NaN, return x.  */
+	bnall	yh, a6, 1f
+	/* Both x and y are either NaN or Inf, so the result is NaN.  */
+	movi	a4, 0x80000	/* make it a quiet NaN */
+	or	xh, xh, a4
+1:	leaf_return
+
+.Lsub_ynan_or_inf:
+	/* Negate y and return it.  */
+	slli	a7, a6, 11
+	xor	xh, yh, a7
+	mov	xl, yl
+	leaf_return
+
+.Lsub_opposite_signs:
+	/* Operand signs differ.  Do an addition.  */
+	slli	a7, a6, 11
+	xor	yh, yh, a7
+	j	.Ladd_same_sign
+
+	.align	4
+	.global	__subdf3
+	.type	__subdf3, @function
+__subdf3:
+	leaf_entry sp, 16
+	movi	a6, 0x7ff00000
+
+	/* Check if the two operands have the same sign.  */
+	xor	a7, xh, yh
+	bltz	a7, .Lsub_opposite_signs
+
+.Lsub_same_sign:	
+	/* Check if either exponent == 0x7ff (i.e., NaN or Infinity).  */
+	ball	xh, a6, .Lsub_xnan_or_inf
+	ball	yh, a6, .Lsub_ynan_or_inf
+
+	/* Compare the operands.  In contrast to addition, the entire
+	   value matters here.  */
+	extui	a7, xh, 20, 11
+	extui	a8, yh, 20, 11
+	bltu	xh, yh, .Lsub_xsmaller
+	beq	xh, yh, .Lsub_compare_low
+
+.Lsub_ysmaller:
+	/* Check if the smaller (or equal) exponent is zero.  */
+	bnone	yh, a6, .Lsub_yexpzero
+
+	/* Replace yh sign/exponent with 0x001.  */
+	or	yh, yh, a6
+	slli	yh, yh, 11
+	srli	yh, yh, 11
+
+.Lsub_yexpdiff:
+	/* Compute the exponent difference.  Optimize for difference < 32.  */
+	sub	a10, a7, a8
+	bgeui	a10, 32, .Lsub_bigshifty
+	
+	/* Shift yh/yl right by the exponent difference.  Any bits that are
+	   shifted out of yl are saved in a9 for rounding the result.  */
+	ssr	a10
+	movi	a9, 0
+	src	a9, yl, a9
+	src	yl, yh, yl
+	srl	yh, yh
+
+.Lsub_suby:
+	/* Do the 64-bit subtraction.  */
+	sub	xh, xh, yh
+	bgeu	xl, yl, 1f
+	addi	xh, xh, -1
+1:	sub	xl, xl, yl
+
+	/* Subtract the leftover bits in a9 from zero and propagate any
+	   borrow from xh/xl.  */
+	neg	a9, a9
+	beqz	a9, 1f
+	addi	a5, xh, -1
+	moveqz	xh, a5, xl
+	addi	xl, xl, -1
+1:
+	/* Check if the subtract underflowed into the exponent.  */
+	extui	a10, xh, 20, 11
+	beq	a10, a7, .Lsub_round
+	j	.Lsub_borrow
+
+.Lsub_compare_low:
+	/* The high words are equal.  Compare the low words.  */
+	bltu	xl, yl, .Lsub_xsmaller
+	bltu	yl, xl, .Lsub_ysmaller
+	/* The operands are equal.  Return 0.0.  */
+	movi	xh, 0
+	movi	xl, 0
+1:	leaf_return
+
+.Lsub_yexpzero:
+	/* y is a subnormal value.  Replace its sign/exponent with zero,
+	   i.e., no implicit "1.0".  Unless x is also a subnormal, increment
+	   y's apparent exponent because subnormals behave as if they had
+	   the minimum (nonzero) exponent.  */
+	slli	yh, yh, 12
+	srli	yh, yh, 12
+	bnone	xh, a6, .Lsub_yexpdiff
+	addi	a8, a8, 1
+	j	.Lsub_yexpdiff
+
+.Lsub_bigshifty:
+	/* Exponent difference > 64 -- just return the bigger value.  */
+	bgeui	a10, 64, 1b
+
+	/* Shift yh/yl right by the exponent difference.  Any bits that are
+	   shifted out are saved in a9 for rounding the result.  */
+	ssr	a10
+	sll	a11, yl		/* lost bits shifted out of yl */
+	src	a9, yh, yl
+	srl	yl, yh
+	movi	yh, 0
+	beqz	a11, .Lsub_suby
+	or	a9, a9, a10	/* any positive, nonzero value will work */
+	j	.Lsub_suby
+
+.Lsub_xsmaller:
+	/* Same thing as the "ysmaller" code, but with x and y swapped and
+	   with y negated.  */
+	bnone	xh, a6, .Lsub_xexpzero
+
+	or	xh, xh, a6
+	slli	xh, xh, 11
+	srli	xh, xh, 11
+
+.Lsub_xexpdiff:
+	sub	a10, a8, a7
+	bgeui	a10, 32, .Lsub_bigshiftx
+	
+	ssr	a10
+	movi	a9, 0
+	src	a9, xl, a9
+	src	xl, xh, xl
+	srl	xh, xh
+
+	/* Negate y.  */
+	slli	a11, a6, 11
+	xor	yh, yh, a11
+
+.Lsub_subx:
+	sub	xl, yl, xl
+	sub	xh, yh, xh
+	bgeu	yl, xl, 1f
+	addi	xh, xh, -1
+1:
+	/* Subtract the leftover bits in a9 from zero and propagate any
+	   borrow from xh/xl.  */
+	neg	a9, a9
+	beqz	a9, 1f
+	addi	a5, xh, -1
+	moveqz	xh, a5, xl
+	addi	xl, xl, -1
+1:
+	/* Check if the subtract underflowed into the exponent.  */
+	extui	a10, xh, 20, 11
+	bne	a10, a8, .Lsub_borrow
+
+.Lsub_round:
+	/* Round up if the leftover fraction is >= 1/2.  */
+	bgez	a9, 1f
+	addi	xl, xl, 1
+	beqz	xl, .Lsub_roundcarry
+
+	/* Check if the leftover fraction is exactly 1/2.  */
+	slli	a9, a9, 1
+	beqz	a9, .Lsub_exactlyhalf
+1:	leaf_return
+
+.Lsub_xexpzero:
+	/* Same as "yexpzero".  */
+	slli	xh, xh, 12
+	srli	xh, xh, 12
+	bnone	yh, a6, .Lsub_xexpdiff
+	addi	a7, a7, 1
+	j	.Lsub_xexpdiff
+
+.Lsub_bigshiftx:
+	/* Mostly the same thing as "bigshifty", but with the sign bit of the
+	   shifted value set so that the subsequent subtraction flips the
+	   sign of y.  */
+	bgeui	a10, 64, .Lsub_returny
+
+	ssr	a10
+	sll	a11, xl
+	src	a9, xh, xl
+	srl	xl, xh
+	slli	xh, a6, 11	/* set sign bit of xh */
+	beqz	a11, .Lsub_subx
+	or	a9, a9, a10
+	j	.Lsub_subx
+
+.Lsub_returny:
+	/* Negate and return y.  */
+	slli	a7, a6, 11
+	xor	xh, yh, a7
+	mov	xl, yl
+	leaf_return
+
+.Lsub_borrow:	
+	/* The subtraction has underflowed into the exponent field, so the
+	   value needs to be renormalized.  Shift the mantissa left as
+	   needed to remove any leading zeros and adjust the exponent
+	   accordingly.  If the exponent is not large enough to remove
+	   all the leading zeros, the result will be a subnormal value.  */
+
+	slli	a8, xh, 12
+	beqz	a8, .Lsub_xhzero
+	do_nsau	a6, a8, a7, a11
+	srli	a8, a8, 12
+	bge	a6, a10, .Lsub_subnormal
+	addi	a6, a6, 1
+
+.Lsub_shift_lt32:
+	/* Shift the mantissa (a8/xl/a9) left by a6.  */
+	ssl	a6
+	src	a8, a8, xl
+	src	xl, xl, a9
+	sll	a9, a9
+
+	/* Combine the shifted mantissa with the sign and exponent,
+	   decrementing the exponent by a6.  (The exponent has already
+	   been decremented by one due to the borrow from the subtraction,
+	   but adding the mantissa will increment the exponent by one.)  */
+	srli	xh, xh, 20
+	sub	xh, xh, a6
+	slli	xh, xh, 20
+	add	xh, xh, a8
+	j	.Lsub_round
+
+.Lsub_exactlyhalf:
+	/* Round down to the nearest even value.  */
+	srli	xl, xl, 1
+	slli	xl, xl, 1
+	leaf_return
+
+.Lsub_roundcarry:
+	/* xl is always zero when the rounding increment overflows, so
+	   there's no need to round it to an even value.  */
+	addi	xh, xh, 1
+	/* Overflow to the exponent is OK.  */
+	leaf_return
+
+.Lsub_xhzero:
+	/* When normalizing the result, all the mantissa bits in the high
+	   word are zero.  Shift by "20 + (leading zero count of xl) + 1".  */
+	do_nsau	a6, xl, a7, a11
+	addi	a6, a6, 21
+	blt	a10, a6, .Lsub_subnormal
+
+.Lsub_normalize_shift:
+	bltui	a6, 32, .Lsub_shift_lt32
+
+	ssl	a6
+	src	a8, xl, a9
+	sll	xl, a9
+	movi	a9, 0
+
+	srli	xh, xh, 20
+	sub	xh, xh, a6
+	slli	xh, xh, 20
+	add	xh, xh, a8
+	j	.Lsub_round
+
+.Lsub_subnormal:
+	/* The exponent is too small to shift away all the leading zeros.
+	   Set a6 to the current exponent (which has already been
+	   decremented by the borrow) so that the exponent of the result
+	   will be zero.  Do not add 1 to a6 in this case, because: (1)
+	   adding the mantissa will not increment the exponent, so there is
+	   no need to subtract anything extra from the exponent to
+	   compensate, and (2) the effective exponent of a subnormal is 1
+	   not 0 so the shift amount must be 1 smaller than normal. */
+	mov	a6, a10
+	j	.Lsub_normalize_shift
+
+#endif /* L_addsubdf3 */
+
+#ifdef L_muldf3
+
+	/* Multiplication */
+#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#define XCHAL_NO_MUL 1
+#endif
+
+__muldf3_aux:
+
+	/* Handle unusual cases (zeros, subnormals, NaNs and Infinities).
+	   (This code is placed before the start of the function just to
+	   keep it in range of the limited branch displacements.)  */
+
+.Lmul_xexpzero:
+	/* Clear the sign bit of x.  */
+	slli	xh, xh, 1
+	srli	xh, xh, 1
+
+	/* If x is zero, return zero.  */
+	or	a10, xh, xl
+	beqz	a10, .Lmul_return_zero
+
+	/* Normalize x.  Adjust the exponent in a8.  */
+	beqz	xh, .Lmul_xh_zero
+	do_nsau	a10, xh, a11, a12
+	addi	a10, a10, -11
+	ssl	a10
+	src	xh, xh, xl
+	sll	xl, xl
+	movi	a8, 1
+	sub	a8, a8, a10
+	j	.Lmul_xnormalized	
+.Lmul_xh_zero:
+	do_nsau	a10, xl, a11, a12
+	addi	a10, a10, -11
+	movi	a8, -31
+	sub	a8, a8, a10
+	ssl	a10
+	bltz	a10, .Lmul_xl_srl
+	sll	xh, xl
+	movi	xl, 0
+	j	.Lmul_xnormalized
+.Lmul_xl_srl:
+	srl	xh, xl
+	sll	xl, xl
+	j	.Lmul_xnormalized
+	
+.Lmul_yexpzero:
+	/* Clear the sign bit of y.  */
+	slli	yh, yh, 1
+	srli	yh, yh, 1
+
+	/* If y is zero, return zero.  */
+	or	a10, yh, yl
+	beqz	a10, .Lmul_return_zero
+
+	/* Normalize y.  Adjust the exponent in a9.  */
+	beqz	yh, .Lmul_yh_zero
+	do_nsau	a10, yh, a11, a12
+	addi	a10, a10, -11
+	ssl	a10
+	src	yh, yh, yl
+	sll	yl, yl
+	movi	a9, 1
+	sub	a9, a9, a10
+	j	.Lmul_ynormalized	
+.Lmul_yh_zero:
+	do_nsau	a10, yl, a11, a12
+	addi	a10, a10, -11
+	movi	a9, -31
+	sub	a9, a9, a10
+	ssl	a10
+	bltz	a10, .Lmul_yl_srl
+	sll	yh, yl
+	movi	yl, 0
+	j	.Lmul_ynormalized
+.Lmul_yl_srl:
+	srl	yh, yl
+	sll	yl, yl
+	j	.Lmul_ynormalized	
+
+.Lmul_return_zero:
+	/* Return zero with the appropriate sign bit.  */
+	srli	xh, a7, 31
+	slli	xh, xh, 31
+	movi	xl, 0
+	j	.Lmul_done
+
+.Lmul_xnan_or_inf:
+	/* If y is zero, return NaN.  */
+	bnez	yl, 1f
+	slli	a8, yh, 1
+	bnez	a8, 1f
+	movi	a4, 0x80000	/* make it a quiet NaN */
+	or	xh, xh, a4
+	j	.Lmul_done
+1:
+	/* If y is NaN, return y.  */
+	bnall	yh, a6, .Lmul_returnx
+	slli	a8, yh, 12
+	or	a8, a8, yl
+	beqz	a8, .Lmul_returnx
+
+.Lmul_returny:
+	mov	xh, yh
+	mov	xl, yl
+
+.Lmul_returnx:
+	/* Set the sign bit and return.  */
+	extui	a7, a7, 31, 1
+	slli	xh, xh, 1
+	ssai	1
+	src	xh, a7, xh
+	j	.Lmul_done
+
+.Lmul_ynan_or_inf:
+	/* If x is zero, return NaN.  */
+	bnez	xl, .Lmul_returny
+	slli	a8, xh, 1
+	bnez	a8, .Lmul_returny
+	movi	a7, 0x80000	/* make it a quiet NaN */
+	or	xh, yh, a7
+	j	.Lmul_done
+
+	.align	4
+	.global	__muldf3
+	.type	__muldf3, @function
+__muldf3:
+#if __XTENSA_CALL0_ABI__
+	leaf_entry sp, 32
+	addi	sp, sp, -32
+	s32i	a12, sp, 16
+	s32i	a13, sp, 20
+	s32i	a14, sp, 24
+	s32i	a15, sp, 28
+#elif XCHAL_NO_MUL
+	/* This is not really a leaf function; allocate enough stack space
+	   to allow CALL12s to a helper function.  */
+	leaf_entry sp, 64
+#else
+	leaf_entry sp, 32
+#endif
+	movi	a6, 0x7ff00000
+
+	/* Get the sign of the result.  */
+	xor	a7, xh, yh
+
+	/* Check for NaN and infinity.  */
+	ball	xh, a6, .Lmul_xnan_or_inf
+	ball	yh, a6, .Lmul_ynan_or_inf
+
+	/* Extract the exponents.  */
+	extui	a8, xh, 20, 11
+	extui	a9, yh, 20, 11
+
+	beqz	a8, .Lmul_xexpzero
+.Lmul_xnormalized:	
+	beqz	a9, .Lmul_yexpzero
+.Lmul_ynormalized:	
+
+	/* Add the exponents.  */
+	add	a8, a8, a9
+
+	/* Replace sign/exponent fields with explicit "1.0".  */
+	movi	a10, 0x1fffff
+	or	xh, xh, a6
+	and	xh, xh, a10
+	or	yh, yh, a6
+	and	yh, yh, a10
+
+	/* Multiply 64x64 to 128 bits.  The result ends up in xh/xl/a6.
+	   The least-significant word of the result is thrown away except
+	   that if it is nonzero, the lsb of a6 is set to 1.  */
+#if XCHAL_HAVE_MUL32_HIGH
+
+	/* Compute a6 with any carry-outs in a10.  */
+	movi	a10, 0
+	mull	a6, xl, yh
+	mull	a11, xh, yl
+	add	a6, a6, a11
+	bgeu	a6, a11, 1f
+	addi	a10, a10, 1
+1:
+	muluh	a11, xl, yl
+	add	a6, a6, a11
+	bgeu	a6, a11, 1f
+	addi	a10, a10, 1
+1:	
+	/* If the low word of the result is nonzero, set the lsb of a6.  */
+	mull	a11, xl, yl
+	beqz	a11, 1f
+	movi	a9, 1
+	or	a6, a6, a9
+1:
+	/* Compute xl with any carry-outs in a9.  */
+	movi	a9, 0
+	mull	a11, xh, yh
+	add	a10, a10, a11
+	bgeu	a10, a11, 1f
+	addi	a9, a9, 1
+1:	
+	muluh	a11, xh, yl
+	add	a10, a10, a11
+	bgeu	a10, a11, 1f
+	addi	a9, a9, 1
+1:	
+	muluh	xl, xl, yh
+	add	xl, xl, a10
+	bgeu	xl, a10, 1f
+	addi	a9, a9, 1
+1:
+	/* Compute xh.  */
+	muluh	xh, xh, yh
+	add	xh, xh, a9
+
+#else /* ! XCHAL_HAVE_MUL32_HIGH */
+
+	/* Break the inputs into 16-bit chunks and compute 16 32-bit partial
+	   products.  These partial products are:
+
+		0 xll * yll
+
+		1 xll * ylh
+		2 xlh * yll
+
+		3 xll * yhl
+		4 xlh * ylh
+		5 xhl * yll
+
+		6 xll * yhh
+		7 xlh * yhl
+		8 xhl * ylh
+		9 xhh * yll
+
+		10 xlh * yhh
+		11 xhl * yhl
+		12 xhh * ylh
+
+		13 xhl * yhh
+		14 xhh * yhl
+
+		15 xhh * yhh
+
+	   where the input chunks are (hh, hl, lh, ll).  If using the Mul16
+	   or Mul32 multiplier options, these input chunks must be stored in
+	   separate registers.  For Mac16, the UMUL.AA.* opcodes can specify
+	   that the inputs come from either half of the registers, so there
+	   is no need to shift them out ahead of time.  If there is no
+	   multiply hardware, the 16-bit chunks can be extracted when setting
+	   up the arguments to the separate multiply function.  */
+
+	/* Save a7 since it is needed to hold a temporary value.  */
+	s32i	a7, sp, 4
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
+	/* Calling a separate multiply function will clobber a0 and requires
+	   use of a8 as a temporary, so save those values now.  (The function
+	   uses a custom ABI so nothing else needs to be saved.)  */
+	s32i	a0, sp, 0
+	s32i	a8, sp, 8
+#endif
+
+#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32
+
+#define xlh a12
+#define ylh a13
+#define xhh a14
+#define yhh a15
+
+	/* Get the high halves of the inputs into registers.  */
+	srli	xlh, xl, 16
+	srli	ylh, yl, 16
+	srli	xhh, xh, 16
+	srli	yhh, yh, 16
+
+#define xll xl
+#define yll yl
+#define xhl xh
+#define yhl yh
+
+#if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16
+	/* Clear the high halves of the inputs.  This does not matter
+	   for MUL16 because the high bits are ignored.  */
+	extui	xl, xl, 0, 16
+	extui	xh, xh, 0, 16
+	extui	yl, yl, 0, 16
+	extui	yh, yh, 0, 16
+#endif
+#endif /* MUL16 || MUL32 */
+
+
+#if XCHAL_HAVE_MUL16
+
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+	mul16u	dst, xreg ## xhalf, yreg ## yhalf
+
+#elif XCHAL_HAVE_MUL32
+
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+	mull	dst, xreg ## xhalf, yreg ## yhalf
+
+#elif XCHAL_HAVE_MAC16
+
+/* The preprocessor insists on inserting a space when concatenating after
+   a period in the definition of do_mul below.  These macros are a workaround
+   using underscores instead of periods when doing the concatenation.  */
+#define umul_aa_ll umul.aa.ll
+#define umul_aa_lh umul.aa.lh
+#define umul_aa_hl umul.aa.hl
+#define umul_aa_hh umul.aa.hh
+
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+	umul_aa_ ## xhalf ## yhalf	xreg, yreg; \
+	rsr	dst, ACCLO
+
+#else /* no multiply hardware */
+	
+#define set_arg_l(dst, src) \
+	extui	dst, src, 0, 16
+#define set_arg_h(dst, src) \
+	srli	dst, src, 16
+
+#if __XTENSA_CALL0_ABI__
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+	set_arg_ ## xhalf (a13, xreg); \
+	set_arg_ ## yhalf (a14, yreg); \
+	call0	.Lmul_mulsi3; \
+	mov	dst, a12
+#else
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+	set_arg_ ## xhalf (a14, xreg); \
+	set_arg_ ## yhalf (a15, yreg); \
+	call12	.Lmul_mulsi3; \
+	mov	dst, a14
+#endif /* __XTENSA_CALL0_ABI__ */
+
+#endif /* no multiply hardware */
+
+	/* Add pp1 and pp2 into a10 with carry-out in a9.  */
+	do_mul(a10, xl, l, yl, h)	/* pp 1 */
+	do_mul(a11, xl, h, yl, l)	/* pp 2 */
+	movi	a9, 0
+	add	a10, a10, a11
+	bgeu	a10, a11, 1f
+	addi	a9, a9, 1
+1:
+	/* Initialize a6 with a9/a10 shifted into position.  Note that
+	   this value can be safely incremented without any carry-outs.  */
+	ssai	16
+	src	a6, a9, a10
+
+	/* Compute the low word into a10.  */
+	do_mul(a11, xl, l, yl, l)	/* pp 0 */
+	sll	a10, a10
+	add	a10, a10, a11
+	bgeu	a10, a11, 1f
+	addi	a6, a6, 1
+1:
+	/* Compute the contributions of pp0-5 to a6, with carry-outs in a9.
+	   This is good enough to determine the low half of a6, so that any
+	   nonzero bits from the low word of the result can be collapsed
+	   into a6, freeing up a register.  */
+	movi	a9, 0
+	do_mul(a11, xl, l, yh, l)	/* pp 3 */
+	add	a6, a6, a11
+	bgeu	a6, a11, 1f
+	addi	a9, a9, 1
+1:
+	do_mul(a11, xl, h, yl, h)	/* pp 4 */
+	add	a6, a6, a11
+	bgeu	a6, a11, 1f
+	addi	a9, a9, 1
+1:
+	do_mul(a11, xh, l, yl, l)	/* pp 5 */
+	add	a6, a6, a11
+	bgeu	a6, a11, 1f
+	addi	a9, a9, 1
+1:
+	/* Collapse any nonzero bits from the low word into a6.  */
+	beqz	a10, 1f
+	movi	a11, 1
+	or	a6, a6, a11
+1:
+	/* Add pp6-9 into a11 with carry-outs in a10.  */
+	do_mul(a7, xl, l, yh, h)	/* pp 6 */
+	do_mul(a11, xh, h, yl, l)	/* pp 9 */
+	movi	a10, 0
+	add	a11, a11, a7
+	bgeu	a11, a7, 1f
+	addi	a10, a10, 1
+1:	
+	do_mul(a7, xl, h, yh, l)	/* pp 7 */
+	add	a11, a11, a7
+	bgeu	a11, a7, 1f
+	addi	a10, a10, 1
+1:	
+	do_mul(a7, xh, l, yl, h)	/* pp 8 */
+	add	a11, a11, a7
+	bgeu	a11, a7, 1f
+	addi	a10, a10, 1
+1:	
+	/* Shift a10/a11 into position, and add low half of a11 to a6.  */
+	src	a10, a10, a11
+	add	a10, a10, a9
+	sll	a11, a11
+	add	a6, a6, a11
+	bgeu	a6, a11, 1f
+	addi	a10, a10, 1
+1:
+	/* Add pp10-12 into xl with carry-outs in a9.  */
+	movi	a9, 0
+	do_mul(xl, xl, h, yh, h)	/* pp 10 */
+	add	xl, xl, a10
+	bgeu	xl, a10, 1f
+	addi	a9, a9, 1
+1:
+	do_mul(a10, xh, l, yh, l)	/* pp 11 */
+	add	xl, xl, a10
+	bgeu	xl, a10, 1f
+	addi	a9, a9, 1
+1:
+	do_mul(a10, xh, h, yl, h)	/* pp 12 */
+	add	xl, xl, a10
+	bgeu	xl, a10, 1f
+	addi	a9, a9, 1
+1:
+	/* Add pp13-14 into a11 with carry-outs in a10.  */
+	do_mul(a11, xh, l, yh, h)	/* pp 13 */
+	do_mul(a7, xh, h, yh, l)	/* pp 14 */
+	movi	a10, 0
+	add	a11, a11, a7
+	bgeu	a11, a7, 1f
+	addi	a10, a10, 1
+1:
+	/* Shift a10/a11 into position, and add low half of a11 to a6.  */
+	src	a10, a10, a11
+	add	a10, a10, a9
+	sll	a11, a11
+	add	xl, xl, a11
+	bgeu	xl, a11, 1f
+	addi	a10, a10, 1
+1:
+	/* Compute xh.  */
+	do_mul(xh, xh, h, yh, h)	/* pp 15 */
+	add	xh, xh, a10
+
+	/* Restore values saved on the stack during the multiplication.  */
+	l32i	a7, sp, 4
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
+	l32i	a0, sp, 0
+	l32i	a8, sp, 8
+#endif
+#endif /* ! XCHAL_HAVE_MUL32_HIGH */
+
+	/* Shift left by 12 bits, unless there was a carry-out from the
+	   multiply, in which case, shift by 11 bits and increment the
+	   exponent.  Note: It is convenient to use the constant 0x3ff
+	   instead of 0x400 when removing the extra exponent bias (so that
+	   it is easy to construct 0x7fe for the overflow check).  Reverse
+	   the logic here to decrement the exponent sum by one unless there
+	   was a carry-out.  */
+	movi	a4, 11
+	srli	a5, xh, 21 - 12
+	bnez	a5, 1f
+	addi	a4, a4, 1
+	addi	a8, a8, -1
+1:	ssl	a4
+	src	xh, xh, xl
+	src	xl, xl, a6
+	sll	a6, a6
+
+	/* Subtract the extra bias from the exponent sum (plus one to account
+	   for the explicit "1.0" of the mantissa that will be added to the
+	   exponent in the final result).  */
+	movi	a4, 0x3ff
+	sub	a8, a8, a4
+	
+	/* Check for over/underflow.  The value in a8 is one less than the
+	   final exponent, so values in the range 0..7fd are OK here.  */
+	slli	a4, a4, 1	/* 0x7fe */
+	bgeu	a8, a4, .Lmul_overflow
+	
+.Lmul_round:
+	/* Round.  */
+	bgez	a6, .Lmul_rounded
+	addi	xl, xl, 1
+	beqz	xl, .Lmul_roundcarry
+	slli	a6, a6, 1
+	beqz	a6, .Lmul_exactlyhalf
+
+.Lmul_rounded:
+	/* Add the exponent to the mantissa.  */
+	slli	a8, a8, 20
+	add	xh, xh, a8
+
+.Lmul_addsign:
+	/* Add the sign bit.  */
+	srli	a7, a7, 31
+	slli	a7, a7, 31
+	or	xh, xh, a7
+
+.Lmul_done:
+#if __XTENSA_CALL0_ABI__
+	l32i	a12, sp, 16
+	l32i	a13, sp, 20
+	l32i	a14, sp, 24
+	l32i	a15, sp, 28
+	addi	sp, sp, 32
+#endif
+	leaf_return
+
+.Lmul_exactlyhalf:
+	/* Round down to the nearest even value.  */
+	srli	xl, xl, 1
+	slli	xl, xl, 1
+	j	.Lmul_rounded
+
+.Lmul_roundcarry:
+	/* xl is always zero when the rounding increment overflows, so
+	   there's no need to round it to an even value.  */
+	addi	xh, xh, 1
+	/* Overflow is OK -- it will be added to the exponent.  */
+	j	.Lmul_rounded
+
+.Lmul_overflow:
+	bltz	a8, .Lmul_underflow
+	/* Return +/- Infinity.  */
+	addi	a8, a4, 1	/* 0x7ff */
+	slli	xh, a8, 20
+	movi	xl, 0
+	j	.Lmul_addsign
+
+.Lmul_underflow:
+	/* Create a subnormal value, where the exponent field contains zero,
+	   but the effective exponent is 1.  The value of a8 is one less than
+	   the actual exponent, so just negate it to get the shift amount.  */
+	neg	a8, a8
+	mov	a9, a6
+	ssr	a8
+	bgeui	a8, 32, .Lmul_bigshift
+	
+	/* Shift xh/xl right.  Any bits that are shifted out of xl are saved
+	   in a6 (combined with the shifted-out bits currently in a6) for
+	   rounding the result.  */
+	sll	a6, xl
+	src	xl, xh, xl
+	srl	xh, xh
+	j	1f
+
+.Lmul_bigshift:
+	bgeui	a8, 64, .Lmul_flush_to_zero
+	sll	a10, xl		/* lost bits shifted out of xl */
+	src	a6, xh, xl
+	srl	xl, xh
+	movi	xh, 0
+	or	a9, a9, a10
+
+	/* Set the exponent to zero.  */
+1:	movi	a8, 0
+
+	/* Pack any nonzero bits shifted out into a6.  */
+	beqz	a9, .Lmul_round
+	movi	a9, 1
+	or	a6, a6, a9
+	j	.Lmul_round
+	
+.Lmul_flush_to_zero:
+	/* Return zero with the appropriate sign bit.  */
+	srli	xh, a7, 31
+	slli	xh, xh, 31
+	movi	xl, 0
+	j	.Lmul_done
+
+#if XCHAL_NO_MUL
+	
+	/* For Xtensa processors with no multiply hardware, this simplified
+	   version of _mulsi3 is used for multiplying 16-bit chunks of
+	   the floating-point mantissas.  When using CALL0, this function
+	   uses a custom ABI: the inputs are passed in a13 and a14, the
+	   result is returned in a12, and a8 and a15 are clobbered.  */
+	.align	4
+.Lmul_mulsi3:
+	leaf_entry sp, 16
+	.macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2
+	movi	\dst, 0
+1:	add	\tmp1, \src2, \dst
+	extui	\tmp2, \src1, 0, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx2 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 1, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx4 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 2, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx8 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 3, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	srli	\src1, \src1, 4
+	slli	\src2, \src2, 4
+	bnez	\src1, 1b
+	.endm
+#if __XTENSA_CALL0_ABI__
+	mul_mulsi3_body a12, a13, a14, a15, a8
+#else
+	/* The result will be written into a2, so save that argument in a4.  */
+	mov	a4, a2
+	mul_mulsi3_body a2, a4, a3, a5, a6
+#endif
+	leaf_return
+#endif /* XCHAL_NO_MUL */
+#endif /* L_muldf3 */
+
+#ifdef L_divdf3
+
+	/* Division */
+__divdf3_aux:
+
+	/* Handle unusual cases (zeros, subnormals, NaNs and Infinities).
+	   (This code is placed before the start of the function just to
+	   keep it in range of the limited branch displacements.)  */
+
+.Ldiv_yexpzero:
+	/* Clear the sign bit of y.  */
+	slli	yh, yh, 1
+	srli	yh, yh, 1
+
+	/* Check for division by zero.  */
+	or	a10, yh, yl
+	beqz	a10, .Ldiv_yzero
+
+	/* Normalize y.  Adjust the exponent in a9.  */
+	beqz	yh, .Ldiv_yh_zero
+	do_nsau	a10, yh, a11, a9
+	addi	a10, a10, -11
+	ssl	a10
+	src	yh, yh, yl
+	sll	yl, yl
+	movi	a9, 1
+	sub	a9, a9, a10
+	j	.Ldiv_ynormalized	
+.Ldiv_yh_zero:
+	do_nsau	a10, yl, a11, a9
+	addi	a10, a10, -11
+	movi	a9, -31
+	sub	a9, a9, a10
+	ssl	a10
+	bltz	a10, .Ldiv_yl_srl
+	sll	yh, yl
+	movi	yl, 0
+	j	.Ldiv_ynormalized
+.Ldiv_yl_srl:
+	srl	yh, yl
+	sll	yl, yl
+	j	.Ldiv_ynormalized	
+
+.Ldiv_yzero:
+	/* y is zero.  Return NaN if x is also zero; otherwise, infinity.  */
+	slli	xh, xh, 1
+	srli	xh, xh, 1
+	or	xl, xl, xh
+	srli	xh, a7, 31
+	slli	xh, xh, 31
+	or	xh, xh, a6
+	bnez	xl, 1f
+	movi	a4, 0x80000	/* make it a quiet NaN */
+	or	xh, xh, a4
+1:	movi	xl, 0
+	leaf_return
+
+.Ldiv_xexpzero:
+	/* Clear the sign bit of x.  */
+	slli	xh, xh, 1
+	srli	xh, xh, 1
+
+	/* If x is zero, return zero.  */
+	or	a10, xh, xl
+	beqz	a10, .Ldiv_return_zero
+
+	/* Normalize x.  Adjust the exponent in a8.  */
+	beqz	xh, .Ldiv_xh_zero
+	do_nsau	a10, xh, a11, a8
+	addi	a10, a10, -11
+	ssl	a10
+	src	xh, xh, xl
+	sll	xl, xl
+	movi	a8, 1
+	sub	a8, a8, a10
+	j	.Ldiv_xnormalized	
+.Ldiv_xh_zero:
+	do_nsau	a10, xl, a11, a8
+	addi	a10, a10, -11
+	movi	a8, -31
+	sub	a8, a8, a10
+	ssl	a10
+	bltz	a10, .Ldiv_xl_srl
+	sll	xh, xl
+	movi	xl, 0
+	j	.Ldiv_xnormalized
+.Ldiv_xl_srl:
+	srl	xh, xl
+	sll	xl, xl
+	j	.Ldiv_xnormalized
+	
+.Ldiv_return_zero:
+	/* Return zero with the appropriate sign bit.  */
+	srli	xh, a7, 31
+	slli	xh, xh, 31
+	movi	xl, 0
+	leaf_return
+
+.Ldiv_xnan_or_inf:
+	/* Set the sign bit of the result.  */
+	srli	a7, yh, 31
+	slli	a7, a7, 31
+	xor	xh, xh, a7
+	/* If y is NaN or Inf, return NaN.  */
+	bnall	yh, a6, 1f
+	movi	a4, 0x80000	/* make it a quiet NaN */
+	or	xh, xh, a4
+1:	leaf_return
+
+.Ldiv_ynan_or_inf:
+	/* If y is Infinity, return zero.  */
+	slli	a8, yh, 12
+	or	a8, a8, yl
+	beqz	a8, .Ldiv_return_zero
+	/* y is NaN; return it.  */
+	mov	xh, yh
+	mov	xl, yl
+	leaf_return
+
+.Ldiv_highequal1:
+	bltu	xl, yl, 2f
+	j	3f
+
+	.align	4
+	.global	__divdf3
+	.type	__divdf3, @function
+__divdf3:
+	leaf_entry sp, 16
+	movi	a6, 0x7ff00000
+
+	/* Get the sign of the result.  */
+	xor	a7, xh, yh
+
+	/* Check for NaN and infinity.  */
+	ball	xh, a6, .Ldiv_xnan_or_inf
+	ball	yh, a6, .Ldiv_ynan_or_inf
+
+	/* Extract the exponents.  */
+	extui	a8, xh, 20, 11
+	extui	a9, yh, 20, 11
+
+	beqz	a9, .Ldiv_yexpzero
+.Ldiv_ynormalized:	
+	beqz	a8, .Ldiv_xexpzero
+.Ldiv_xnormalized:	
+
+	/* Subtract the exponents.  */
+	sub	a8, a8, a9
+
+	/* Replace sign/exponent fields with explicit "1.0".  */
+	movi	a10, 0x1fffff
+	or	xh, xh, a6
+	and	xh, xh, a10
+	or	yh, yh, a6
+	and	yh, yh, a10
+
+	/* Set SAR for left shift by one.  */
+	ssai	(32 - 1)
+
+	/* The first digit of the mantissa division must be a one.
+	   Shift x (and adjust the exponent) as needed to make this true.  */
+	bltu	yh, xh, 3f
+	beq	yh, xh, .Ldiv_highequal1
+2:	src	xh, xh, xl
+	sll	xl, xl
+	addi	a8, a8, -1
+3:
+	/* Do the first subtraction and shift.  */
+	sub	xh, xh, yh
+	bgeu	xl, yl, 1f
+	addi	xh, xh, -1
+1:	sub	xl, xl, yl
+	src	xh, xh, xl
+	sll	xl, xl
+
+	/* Put the quotient into a10/a11.  */
+	movi	a10, 0
+	movi	a11, 1
+
+	/* Divide one bit at a time for 52 bits.  */
+	movi	a9, 52
+#if XCHAL_HAVE_LOOPS
+	loop	a9, .Ldiv_loopend
+#endif
+.Ldiv_loop:
+	/* Shift the quotient << 1.  */
+	src	a10, a10, a11
+	sll	a11, a11
+
+	/* Is this digit a 0 or 1?  */
+	bltu	xh, yh, 3f
+	beq	xh, yh, .Ldiv_highequal2
+
+	/* Output a 1 and subtract.  */
+2:	addi	a11, a11, 1
+	sub	xh, xh, yh
+	bgeu	xl, yl, 1f
+	addi	xh, xh, -1
+1:	sub	xl, xl, yl
+
+	/* Shift the dividend << 1.  */
+3:	src	xh, xh, xl
+	sll	xl, xl
+
+#if !XCHAL_HAVE_LOOPS
+	addi	a9, a9, -1
+	bnez	a9, .Ldiv_loop
+#endif
+.Ldiv_loopend:
+
+	/* Add the exponent bias (less one to account for the explicit "1.0"
+	   of the mantissa that will be added to the exponent in the final
+	   result).  */
+	movi	a9, 0x3fe
+	add	a8, a8, a9
+	
+	/* Check for over/underflow.  The value in a8 is one less than the
+	   final exponent, so values in the range 0..7fd are OK here.  */
+	addmi	a9, a9, 0x400	/* 0x7fe */
+	bgeu	a8, a9, .Ldiv_overflow
+
+.Ldiv_round:
+	/* Round.  The remainder (<< 1) is in xh/xl.  */
+	bltu	xh, yh, .Ldiv_rounded
+	beq	xh, yh, .Ldiv_highequal3
+.Ldiv_roundup:
+	addi	a11, a11, 1
+	beqz	a11, .Ldiv_roundcarry
+
+.Ldiv_rounded:
+	mov	xl, a11
+	/* Add the exponent to the mantissa.  */
+	slli	a8, a8, 20
+	add	xh, a10, a8
+
+.Ldiv_addsign:
+	/* Add the sign bit.  */
+	srli	a7, a7, 31
+	slli	a7, a7, 31
+	or	xh, xh, a7
+	leaf_return
+
+.Ldiv_highequal2:
+	bgeu	xl, yl, 2b
+	j	3b
+
+.Ldiv_highequal3:
+	bltu	xl, yl, .Ldiv_rounded
+	bne	xl, yl, .Ldiv_roundup
+
+	/* Remainder is exactly half the divisor.  Round even.  */
+	addi	a11, a11, 1
+	beqz	a11, .Ldiv_roundcarry
+	srli	a11, a11, 1
+	slli	a11, a11, 1
+	j	.Ldiv_rounded
+
+.Ldiv_overflow:
+	bltz	a8, .Ldiv_underflow
+	/* Return +/- Infinity.  */
+	addi	a8, a9, 1	/* 0x7ff */
+	slli	xh, a8, 20
+	movi	xl, 0
+	j	.Ldiv_addsign
+
+.Ldiv_underflow:
+	/* Create a subnormal value, where the exponent field contains zero,
+	   but the effective exponent is 1.  The value of a8 is one less than
+	   the actual exponent, so just negate it to get the shift amount.  */
+	neg	a8, a8
+	ssr	a8
+	bgeui	a8, 32, .Ldiv_bigshift
+	
+	/* Shift a10/a11 right.  Any bits that are shifted out of a11 are
+	   saved in a6 for rounding the result.  */
+	sll	a6, a11
+	src	a11, a10, a11
+	srl	a10, a10
+	j	1f
+
+.Ldiv_bigshift:
+	bgeui	a8, 64, .Ldiv_flush_to_zero
+	sll	a9, a11		/* lost bits shifted out of a11 */
+	src	a6, a10, a11
+	srl	a11, a10
+	movi	a10, 0
+	or	xl, xl, a9
+
+	/* Set the exponent to zero.  */
+1:	movi	a8, 0
+
+	/* Pack any nonzero remainder (in xh/xl) into a6.  */
+	or	xh, xh, xl
+	beqz	xh, 1f
+	movi	a9, 1
+	or	a6, a6, a9
+	
+	/* Round a10/a11 based on the bits shifted out into a6.  */
+1:	bgez	a6, .Ldiv_rounded
+	addi	a11, a11, 1
+	beqz	a11, .Ldiv_roundcarry
+	slli	a6, a6, 1
+	bnez	a6, .Ldiv_rounded
+	srli	a11, a11, 1
+	slli	a11, a11, 1
+	j	.Ldiv_rounded
+
+.Ldiv_roundcarry:
+	/* a11 is always zero when the rounding increment overflows, so
+	   there's no need to round it to an even value.  */
+	addi	a10, a10, 1
+	/* Overflow to the exponent field is OK.  */
+	j	.Ldiv_rounded
+
+.Ldiv_flush_to_zero:
+	/* Return zero with the appropriate sign bit.  */
+	srli	xh, a7, 31
+	slli	xh, xh, 31
+	movi	xl, 0
+	leaf_return
+
+#endif /* L_divdf3 */
+
+#ifdef L_cmpdf2
+
+	/* Equal and Not Equal */
+
+	.align	4
+	.global	__eqdf2
+	.global	__nedf2
+	.set	__nedf2, __eqdf2
+	.type	__eqdf2, @function
+__eqdf2:
+	leaf_entry sp, 16
+	bne	xl, yl, 2f
+	bne	xh, yh, 4f
+
+	/* The values are equal but NaN != NaN.  Check the exponent.  */
+	movi	a6, 0x7ff00000
+	ball	xh, a6, 3f
+
+	/* Equal.  */
+	movi	a2, 0
+	leaf_return
+
+	/* Not equal.  */
+2:	movi	a2, 1
+	leaf_return
+
+	/* Check if the mantissas are nonzero.  */
+3:	slli	a7, xh, 12
+	or	a7, a7, xl
+	j	5f
+
+	/* Check if x and y are zero with different signs.  */
+4:	or	a7, xh, yh
+	slli	a7, a7, 1
+	or	a7, a7, xl	/* xl == yl here */
+
+	/* Equal if a7 == 0, where a7 is either abs(x | y) or the mantissa
+	   or x when exponent(x) = 0x7ff and x == y.  */
+5:	movi	a2, 0
+	movi	a3, 1
+	movnez	a2, a3, a7	
+	leaf_return
+
+
+	/* Greater Than */
+
+	.align	4
+	.global	__gtdf2
+	.type	__gtdf2, @function
+__gtdf2:
+	leaf_entry sp, 16
+	movi	a6, 0x7ff00000
+	ball	xh, a6, 2f
+1:	bnall	yh, a6, .Lle_cmp
+
+	/* Check if y is a NaN.  */
+	slli	a7, yh, 12
+	or	a7, a7, yl
+	beqz	a7, .Lle_cmp
+	movi	a2, 0
+	leaf_return
+
+	/* Check if x is a NaN.  */
+2:	slli	a7, xh, 12
+	or	a7, a7, xl
+	beqz	a7, 1b
+	movi	a2, 0
+	leaf_return
+
+
+	/* Less Than or Equal */
+
+	.align	4
+	.global	__ledf2
+	.type	__ledf2, @function
+__ledf2:
+	leaf_entry sp, 16
+	movi	a6, 0x7ff00000
+	ball	xh, a6, 2f
+1:	bnall	yh, a6, .Lle_cmp
+
+	/* Check if y is a NaN.  */
+	slli	a7, yh, 12
+	or	a7, a7, yl
+	beqz	a7, .Lle_cmp
+	movi	a2, 1
+	leaf_return
+
+	/* Check if x is a NaN.  */
+2:	slli	a7, xh, 12
+	or	a7, a7, xl
+	beqz	a7, 1b
+	movi	a2, 1
+	leaf_return
+
+.Lle_cmp:
+	/* Check if x and y have different signs.  */
+	xor	a7, xh, yh
+	bltz	a7, .Lle_diff_signs
+
+	/* Check if x is negative.  */
+	bltz	xh, .Lle_xneg
+
+	/* Check if x <= y.  */
+	bltu	xh, yh, 4f
+	bne	xh, yh, 5f
+	bltu	yl, xl, 5f
+4:	movi	a2, 0
+	leaf_return
+
+.Lle_xneg:
+	/* Check if y <= x.  */
+	bltu	yh, xh, 4b
+	bne	yh, xh, 5f
+	bgeu	xl, yl, 4b
+5:	movi	a2, 1
+	leaf_return
+
+.Lle_diff_signs:
+	bltz	xh, 4b
+
+	/* Check if both x and y are zero.  */
+	or	a7, xh, yh
+	slli	a7, a7, 1
+	or	a7, a7, xl
+	or	a7, a7, yl
+	movi	a2, 1
+	movi	a3, 0
+	moveqz	a2, a3, a7
+	leaf_return
+
+
+	/* Greater Than or Equal */
+
+	.align	4
+	.global	__gedf2
+	.type	__gedf2, @function
+__gedf2:
+	leaf_entry sp, 16
+	movi	a6, 0x7ff00000
+	ball	xh, a6, 2f
+1:	bnall	yh, a6, .Llt_cmp
+
+	/* Check if y is a NaN.  */
+	slli	a7, yh, 12
+	or	a7, a7, yl
+	beqz	a7, .Llt_cmp
+	movi	a2, -1
+	leaf_return
+
+	/* Check if x is a NaN.  */
+2:	slli	a7, xh, 12
+	or	a7, a7, xl
+	beqz	a7, 1b
+	movi	a2, -1
+	leaf_return
+
+
+	/* Less Than */
+
+	.align	4
+	.global	__ltdf2
+	.type	__ltdf2, @function
+__ltdf2:
+	leaf_entry sp, 16
+	movi	a6, 0x7ff00000
+	ball	xh, a6, 2f
+1:	bnall	yh, a6, .Llt_cmp
+
+	/* Check if y is a NaN.  */
+	slli	a7, yh, 12
+	or	a7, a7, yl
+	beqz	a7, .Llt_cmp
+	movi	a2, 0
+	leaf_return
+
+	/* Check if x is a NaN.  */
+2:	slli	a7, xh, 12
+	or	a7, a7, xl
+	beqz	a7, 1b
+	movi	a2, 0
+	leaf_return
+
+.Llt_cmp:
+	/* Check if x and y have different signs.  */
+	xor	a7, xh, yh
+	bltz	a7, .Llt_diff_signs
+
+	/* Check if x is negative.  */
+	bltz	xh, .Llt_xneg
+
+	/* Check if x < y.  */
+	bltu	xh, yh, 4f
+	bne	xh, yh, 5f
+	bgeu	xl, yl, 5f
+4:	movi	a2, -1
+	leaf_return
+
+.Llt_xneg:
+	/* Check if y < x.  */
+	bltu	yh, xh, 4b
+	bne	yh, xh, 5f
+	bltu	yl, xl, 4b
+5:	movi	a2, 0
+	leaf_return
+
+.Llt_diff_signs:
+	bgez	xh, 5b
+
+	/* Check if both x and y are nonzero.  */
+	or	a7, xh, yh
+	slli	a7, a7, 1
+	or	a7, a7, xl
+	or	a7, a7, yl
+	movi	a2, 0
+	movi	a3, -1
+	movnez	a2, a3, a7
+	leaf_return
+
+
+	/* Unordered */
+
+	.align	4
+	.global	__unorddf2
+	.type	__unorddf2, @function
+__unorddf2:
+	leaf_entry sp, 16
+	movi	a6, 0x7ff00000
+	ball	xh, a6, 3f
+1:	ball	yh, a6, 4f
+2:	movi	a2, 0
+	leaf_return
+
+3:	slli	a7, xh, 12
+	or	a7, a7, xl
+	beqz	a7, 1b
+	movi	a2, 1
+	leaf_return
+
+4:	slli	a7, yh, 12
+	or	a7, a7, yl
+	beqz	a7, 2b
+	movi	a2, 1
+	leaf_return
+
+#endif /* L_cmpdf2 */
+
+#ifdef L_fixdfsi
+
+	.align	4
+	.global	__fixdfsi
+	.type	__fixdfsi, @function
+__fixdfsi:
+	leaf_entry sp, 16
+
+	/* Check for NaN and Infinity.  */
+	movi	a6, 0x7ff00000
+	ball	xh, a6, .Lfixdfsi_nan_or_inf
+
+	/* Extract the exponent and check if 0 < (exp - 0x3fe) < 32.  */
+	extui	a4, xh, 20, 11
+	extui	a5, a6, 19, 10	/* 0x3fe */
+	sub	a4, a4, a5
+	bgei	a4, 32, .Lfixdfsi_maxint
+	blti	a4, 1, .Lfixdfsi_zero
+
+	/* Add explicit "1.0" and shift << 11.  */
+	or	a7, xh, a6
+	ssai	(32 - 11)
+	src	a5, a7, xl
+
+	/* Shift back to the right, based on the exponent.  */
+	ssl	a4		/* shift by 32 - a4 */
+	srl	a5, a5
+
+	/* Negate the result if sign != 0.  */
+	neg	a2, a5
+	movgez	a2, a5, a7
+	leaf_return
+
+.Lfixdfsi_nan_or_inf:
+	/* Handle Infinity and NaN.  */
+	slli	a4, xh, 12
+	or	a4, a4, xl
+	beqz	a4, .Lfixdfsi_maxint
+
+	/* Translate NaN to +maxint.  */
+	movi	xh, 0
+
+.Lfixdfsi_maxint:
+	slli	a4, a6, 11	/* 0x80000000 */
+	addi	a5, a4, -1	/* 0x7fffffff */
+	movgez	a4, a5, xh
+	mov	a2, a4
+	leaf_return
+
+.Lfixdfsi_zero:
+	movi	a2, 0
+	leaf_return
+
+#endif /* L_fixdfsi */
+
+#ifdef L_fixdfdi
+
+	.align	4
+	.global	__fixdfdi
+	.type	__fixdfdi, @function
+__fixdfdi:
+	leaf_entry sp, 16
+
+	/* Check for NaN and Infinity.  */
+	movi	a6, 0x7ff00000
+	ball	xh, a6, .Lfixdfdi_nan_or_inf
+
+	/* Extract the exponent and check if 0 < (exp - 0x3fe) < 64.  */
+	extui	a4, xh, 20, 11
+	extui	a5, a6, 19, 10	/* 0x3fe */
+	sub	a4, a4, a5
+	bgei	a4, 64, .Lfixdfdi_maxint
+	blti	a4, 1, .Lfixdfdi_zero
+
+	/* Add explicit "1.0" and shift << 11.  */
+	or	a7, xh, a6
+	ssai	(32 - 11)
+	src	xh, a7, xl
+	sll	xl, xl
+
+	/* Shift back to the right, based on the exponent.  */
+	ssl	a4		/* shift by 64 - a4 */
+	bgei	a4, 32, .Lfixdfdi_smallshift
+	srl	xl, xh
+	movi	xh, 0
+
+.Lfixdfdi_shifted:	
+	/* Negate the result if sign != 0.  */
+	bgez	a7, 1f
+	neg	xl, xl
+	neg	xh, xh
+	beqz	xl, 1f
+	addi	xh, xh, -1
+1:	leaf_return
+
+.Lfixdfdi_smallshift:
+	src	xl, xh, xl
+	srl	xh, xh
+	j	.Lfixdfdi_shifted
+
+.Lfixdfdi_nan_or_inf:
+	/* Handle Infinity and NaN.  */
+	slli	a4, xh, 12
+	or	a4, a4, xl
+	beqz	a4, .Lfixdfdi_maxint
+
+	/* Translate NaN to +maxint.  */
+	movi	xh, 0
+
+.Lfixdfdi_maxint:
+	slli	a7, a6, 11	/* 0x80000000 */
+	bgez	xh, 1f
+	mov	xh, a7
+	movi	xl, 0
+	leaf_return
+
+1:	addi	xh, a7, -1	/* 0x7fffffff */
+	movi	xl, -1
+	leaf_return
+
+.Lfixdfdi_zero:
+	movi	xh, 0
+	movi	xl, 0
+	leaf_return
+
+#endif /* L_fixdfdi */
+
+#ifdef L_fixunsdfsi
+
+	.align	4
+	.global	__fixunsdfsi
+	.type	__fixunsdfsi, @function
+__fixunsdfsi:
+	leaf_entry sp, 16
+
+	/* Check for NaN and Infinity.  */
+	movi	a6, 0x7ff00000
+	ball	xh, a6, .Lfixunsdfsi_nan_or_inf
+
+	/* Extract the exponent and check if 0 <= (exp - 0x3ff) < 32.  */
+	extui	a4, xh, 20, 11
+	extui	a5, a6, 20, 10	/* 0x3ff */
+	sub	a4, a4, a5
+	bgei	a4, 32, .Lfixunsdfsi_maxint
+	bltz	a4, .Lfixunsdfsi_zero
+
+	/* Add explicit "1.0" and shift << 11.  */
+	or	a7, xh, a6
+	ssai	(32 - 11)
+	src	a5, a7, xl
+
+	/* Shift back to the right, based on the exponent.  */
+	addi	a4, a4, 1
+	beqi	a4, 32, .Lfixunsdfsi_bigexp
+	ssl	a4		/* shift by 32 - a4 */
+	srl	a5, a5
+
+	/* Negate the result if sign != 0.  */
+	neg	a2, a5
+	movgez	a2, a5, a7
+	leaf_return
+
+.Lfixunsdfsi_nan_or_inf:
+	/* Handle Infinity and NaN.  */
+	slli	a4, xh, 12
+	or	a4, a4, xl
+	beqz	a4, .Lfixunsdfsi_maxint
+
+	/* Translate NaN to 0xffffffff.  */
+	movi	a2, -1
+	leaf_return
+
+.Lfixunsdfsi_maxint:
+	slli	a4, a6, 11	/* 0x80000000 */
+	movi	a5, -1		/* 0xffffffff */
+	movgez	a4, a5, xh
+	mov	a2, a4
+	leaf_return
+
+.Lfixunsdfsi_zero:
+	movi	a2, 0
+	leaf_return
+
+.Lfixunsdfsi_bigexp:
+	/* Handle unsigned maximum exponent case.  */
+	bltz	xh, 1f
+	mov	a2, a5		/* no shift needed */
+	leaf_return
+
+	/* Return 0x80000000 if negative.  */
+1:	slli	a2, a6, 11
+	leaf_return
+
+#endif /* L_fixunsdfsi */
+
+#ifdef L_fixunsdfdi
+
+	.align	4
+	.global	__fixunsdfdi
+	.type	__fixunsdfdi, @function
+__fixunsdfdi:
+	leaf_entry sp, 16
+
+	/* Check for NaN and Infinity.  */
+	movi	a6, 0x7ff00000
+	ball	xh, a6, .Lfixunsdfdi_nan_or_inf
+
+	/* Extract the exponent and check if 0 <= (exp - 0x3ff) < 64.  */
+	extui	a4, xh, 20, 11
+	extui	a5, a6, 20, 10	/* 0x3ff */
+	sub	a4, a4, a5
+	bgei	a4, 64, .Lfixunsdfdi_maxint
+	bltz	a4, .Lfixunsdfdi_zero
+
+	/* Add explicit "1.0" and shift << 11.  */
+	or	a7, xh, a6
+	ssai	(32 - 11)
+	src	xh, a7, xl
+	sll	xl, xl
+
+	/* Shift back to the right, based on the exponent.  */
+	addi	a4, a4, 1
+	beqi	a4, 64, .Lfixunsdfdi_bigexp
+	ssl	a4		/* shift by 64 - a4 */
+	bgei	a4, 32, .Lfixunsdfdi_smallshift
+	srl	xl, xh
+	movi	xh, 0
+
+.Lfixunsdfdi_shifted:
+	/* Negate the result if sign != 0.  */
+	bgez	a7, 1f
+	neg	xl, xl
+	neg	xh, xh
+	beqz	xl, 1f
+	addi	xh, xh, -1
+1:	leaf_return
+
+.Lfixunsdfdi_smallshift:
+	src	xl, xh, xl
+	srl	xh, xh
+	j	.Lfixunsdfdi_shifted
+
+.Lfixunsdfdi_nan_or_inf:
+	/* Handle Infinity and NaN.  */
+	slli	a4, xh, 12
+	or	a4, a4, xl
+	beqz	a4, .Lfixunsdfdi_maxint
+
+	/* Translate NaN to 0xffffffff.... */
+1:	movi	xh, -1
+	movi	xl, -1
+	leaf_return
+
+.Lfixunsdfdi_maxint:
+	bgez	xh, 1b
+2:	slli	xh, a6, 11	/* 0x80000000 */
+	movi	xl, 0
+	leaf_return
+
+.Lfixunsdfdi_zero:
+	movi	xh, 0
+	movi	xl, 0
+	leaf_return
+
+.Lfixunsdfdi_bigexp:
+	/* Handle unsigned maximum exponent case.  */
+	bltz	a7, 2b
+	leaf_return		/* no shift needed */
+
+#endif /* L_fixunsdfdi */
+
+#ifdef L_floatsidf
+
+	.align	4
+	.global	__floatunsidf
+	.type	__floatunsidf, @function
+__floatunsidf:
+	leaf_entry sp, 16
+	beqz	a2, .Lfloatsidf_return_zero
+
+	/* Set the sign to zero and jump to the floatsidf code.  */
+	movi	a7, 0
+	j	.Lfloatsidf_normalize
+
+	.align	4
+	.global	__floatsidf
+	.type	__floatsidf, @function
+__floatsidf:
+	leaf_entry sp, 16
+
+	/* Check for zero.  */
+	beqz	a2, .Lfloatsidf_return_zero
+
+	/* Save the sign.  */
+	extui	a7, a2, 31, 1
+
+	/* Get the absolute value.  */
+#if XCHAL_HAVE_ABS
+	abs	a2, a2
+#else
+	neg	a4, a2
+	movltz	a2, a4, a2
+#endif
+
+.Lfloatsidf_normalize:
+	/* Normalize with the first 1 bit in the msb.  */
+	do_nsau	a4, a2, a5, a6
+	ssl	a4
+	sll	a5, a2
+
+	/* Shift the mantissa into position.  */
+	srli	xh, a5, 11
+	slli	xl, a5, (32 - 11)
+
+	/* Set the exponent.  */
+	movi	a5, 0x41d	/* 0x3fe + 31 */
+	sub	a5, a5, a4
+	slli	a5, a5, 20
+	add	xh, xh, a5
+
+	/* Add the sign and return. */
+	slli	a7, a7, 31
+	or	xh, xh, a7
+	leaf_return
+
+.Lfloatsidf_return_zero:
+	movi	a3, 0
+	leaf_return
+
+#endif /* L_floatsidf */
+
+#ifdef L_floatdidf
+
+	.align	4
+	.global	__floatundidf
+	.type	__floatundidf, @function
+__floatundidf:
+	leaf_entry sp, 16
+
+	/* Check for zero.  */
+	or	a4, xh, xl
+	beqz	a4, 2f
+
+	/* Set the sign to zero and jump to the floatdidf code.  */
+	movi	a7, 0
+	j	.Lfloatdidf_normalize
+
+	.align	4
+	.global	__floatdidf
+	.type	__floatdidf, @function
+__floatdidf:
+	leaf_entry sp, 16
+
+	/* Check for zero.  */
+	or	a4, xh, xl
+	beqz	a4, 2f
+
+	/* Save the sign.  */
+	extui	a7, xh, 31, 1
+
+	/* Get the absolute value.  */
+	bgez	xh, .Lfloatdidf_normalize
+	neg	xl, xl
+	neg	xh, xh
+	beqz	xl, .Lfloatdidf_normalize
+	addi	xh, xh, -1
+
+.Lfloatdidf_normalize:
+	/* Normalize with the first 1 bit in the msb of xh.  */
+	beqz	xh, .Lfloatdidf_bigshift
+	do_nsau	a4, xh, a5, a6
+	ssl	a4
+	src	xh, xh, xl
+	sll	xl, xl
+
+.Lfloatdidf_shifted:
+	/* Shift the mantissa into position, with rounding bits in a6.  */
+	ssai	11
+	sll	a6, xl
+	src	xl, xh, xl
+	srl	xh, xh
+
+	/* Set the exponent.  */
+	movi	a5, 0x43d	/* 0x3fe + 63 */
+	sub	a5, a5, a4
+	slli	a5, a5, 20
+	add	xh, xh, a5
+
+	/* Add the sign.  */
+	slli	a7, a7, 31
+	or	xh, xh, a7
+
+	/* Round up if the leftover fraction is >= 1/2.  */
+	bgez	a6, 2f
+	addi	xl, xl, 1
+	beqz	xl, .Lfloatdidf_roundcarry
+
+	/* Check if the leftover fraction is exactly 1/2.  */
+	slli	a6, a6, 1
+	beqz	a6, .Lfloatdidf_exactlyhalf
+2:	leaf_return
+
+.Lfloatdidf_bigshift:
+	/* xh is zero.  Normalize with first 1 bit of xl in the msb of xh.  */
+	do_nsau	a4, xl, a5, a6
+	ssl	a4
+	sll	xh, xl
+	movi	xl, 0
+	addi	a4, a4, 32
+	j	.Lfloatdidf_shifted
+
+.Lfloatdidf_exactlyhalf:
+	/* Round down to the nearest even value.  */
+	srli	xl, xl, 1
+	slli	xl, xl, 1
+	leaf_return
+
+.Lfloatdidf_roundcarry:
+	/* xl is always zero when the rounding increment overflows, so
+	   there's no need to round it to an even value.  */
+	addi	xh, xh, 1
+	/* Overflow to the exponent is OK.  */
+	leaf_return
+
+#endif /* L_floatdidf */
+
+#ifdef L_truncdfsf2
+
+	.align	4
+	.global	__truncdfsf2
+	.type	__truncdfsf2, @function
+__truncdfsf2:
+	leaf_entry sp, 16
+
+	/* Adjust the exponent bias.  */
+	movi	a4, (0x3ff - 0x7f) << 20
+	sub	a5, xh, a4
+
+	/* Check for underflow.  */
+	xor	a6, xh, a5
+	bltz	a6, .Ltrunc_underflow
+	extui	a6, a5, 20, 11
+	beqz	a6, .Ltrunc_underflow
+
+	/* Check for overflow.  */
+	movi	a4, 255
+	bge	a6, a4, .Ltrunc_overflow
+
+	/* Shift a5/xl << 3 into a5/a4.  */
+	ssai	(32 - 3)
+	src	a5, a5, xl
+	sll	a4, xl
+
+.Ltrunc_addsign:
+	/* Add the sign bit.  */
+	extui	a6, xh, 31, 1
+	slli	a6, a6, 31
+	or	a2, a6, a5
+
+	/* Round up if the leftover fraction is >= 1/2.  */
+	bgez	a4, 1f
+	addi	a2, a2, 1
+	/* Overflow to the exponent is OK.  The answer will be correct.  */
+
+	/* Check if the leftover fraction is exactly 1/2.  */
+	slli	a4, a4, 1
+	beqz	a4, .Ltrunc_exactlyhalf
+1:	leaf_return
+
+.Ltrunc_exactlyhalf:
+	/* Round down to the nearest even value.  */
+	srli	a2, a2, 1
+	slli	a2, a2, 1
+	leaf_return
+
+.Ltrunc_overflow:
+	/* Check if exponent == 0x7ff.  */
+	movi	a4, 0x7ff00000
+	bnall	xh, a4, 1f
+
+	/* Check if mantissa is nonzero.  */
+	slli	a5, xh, 12
+	or	a5, a5, xl
+	beqz	a5, 1f
+
+	/* Shift a4 to set a bit in the mantissa, making a quiet NaN.  */
+	srli	a4, a4, 1
+
+1:	slli	a4, a4, 4	/* 0xff000000 or 0xff800000 */
+	/* Add the sign bit.  */
+	extui	a6, xh, 31, 1
+	ssai	1
+	src	a2, a6, a4
+	leaf_return
+
+.Ltrunc_underflow:
+	/* Find shift count for a subnormal.  Flush to zero if >= 32.  */
+	extui	a6, xh, 20, 11
+	movi	a5, 0x3ff - 0x7f
+	sub	a6, a5, a6
+	addi	a6, a6, 1
+	bgeui	a6, 32, 1f
+
+	/* Replace the exponent with an explicit "1.0".  */
+	slli	a5, a5, 13	/* 0x700000 */
+	or	a5, a5, xh
+	slli	a5, a5, 11
+	srli	a5, a5, 11
+
+	/* Shift the mantissa left by 3 bits (into a5/a4).  */
+	ssai	(32 - 3)
+	src	a5, a5, xl
+	sll	a4, xl
+
+	/* Shift right by a6.  */
+	ssr	a6
+	sll	a7, a4
+	src	a4, a5, a4
+	srl	a5, a5
+	beqz	a7, .Ltrunc_addsign
+	or	a4, a4, a6	/* any positive, nonzero value will work */
+	j	.Ltrunc_addsign
+
+	/* Return +/- zero.  */
+1:	extui	a2, xh, 31, 1
+	slli	a2, a2, 31
+	leaf_return
+
+#endif /* L_truncdfsf2 */
+
+#ifdef L_extendsfdf2
+
+	.align	4
+	.global	__extendsfdf2
+	.type	__extendsfdf2, @function
+__extendsfdf2:
+	leaf_entry sp, 16
+
+	/* Save the sign bit and then shift it off.  */
+	extui	a5, a2, 31, 1
+	slli	a5, a5, 31
+	slli	a4, a2, 1
+
+	/* Extract and check the exponent.  */
+	extui	a6, a2, 23, 8
+	beqz	a6, .Lextend_expzero
+	addi	a6, a6, 1
+	beqi	a6, 256, .Lextend_nan_or_inf
+
+	/* Shift >> 3 into a4/xl.  */
+	srli	a4, a4, 4
+	slli	xl, a2, (32 - 3)
+
+	/* Adjust the exponent bias.  */
+	movi	a6, (0x3ff - 0x7f) << 20
+	add	a4, a4, a6
+
+	/* Add the sign bit.  */
+	or	xh, a4, a5
+	leaf_return
+
+.Lextend_nan_or_inf:
+	movi	a4, 0x7ff00000
+
+	/* Check for NaN.  */
+	slli	a7, a2, 9
+	beqz	a7, 1f
+
+	slli	a6, a6, 11	/* 0x80000 */
+	or	a4, a4, a6
+
+	/* Add the sign and return.  */
+1:	or	xh, a4, a5
+	movi	xl, 0
+	leaf_return
+
+.Lextend_expzero:
+	beqz	a4, 1b
+
+	/* Normalize it to have 8 zero bits before the first 1 bit.  */
+	do_nsau	a7, a4, a2, a3
+	addi	a7, a7, -8
+	ssl	a7
+	sll	a4, a4
+	
+	/* Shift >> 3 into a4/xl.  */
+	slli	xl, a4, (32 - 3)
+	srli	a4, a4, 3
+
+	/* Set the exponent.  */
+	movi	a6, 0x3fe - 0x7f
+	sub	a6, a6, a7
+	slli	a6, a6, 20
+	add	a4, a4, a6
+
+	/* Add the sign and return.  */
+	or	xh, a4, a5
+	leaf_return
+
+#endif /* L_extendsfdf2 */
+
+
diff --git a/libgcc/config/xtensa/ieee754-sf.S b/libgcc/config/xtensa/ieee754-sf.S
new file mode 100644
index 00000000000..d75be0e5ae5
--- /dev/null
+++ b/libgcc/config/xtensa/ieee754-sf.S
@@ -0,0 +1,1757 @@
+/* IEEE-754 single-precision functions for Xtensa
+   Copyright (C) 2006, 2007, 2009 Free Software Foundation, Inc.
+   Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef __XTENSA_EB__
+#define xh a2
+#define xl a3
+#define yh a4
+#define yl a5
+#else
+#define xh a3
+#define xl a2
+#define yh a5
+#define yl a4
+#endif
+
+/*  Warning!  The branch displacements for some Xtensa branch instructions
+    are quite small, and this code has been carefully laid out to keep
+    branch targets in range.  If you change anything, be sure to check that
+    the assembler is not relaxing anything to branch over a jump.  */
+
+#ifdef L_negsf2
+
+	.align	4
+	.global	__negsf2
+	.type	__negsf2, @function
+__negsf2:
+	leaf_entry sp, 16
+	movi	a4, 0x80000000
+	xor	a2, a2, a4
+	leaf_return
+
+#endif /* L_negsf2 */
+
+#ifdef L_addsubsf3
+
+	/* Addition */
+__addsf3_aux:
+
+	/* Handle NaNs and Infinities.  (This code is placed before the
+	   start of the function just to keep it in range of the limited
+	   branch displacements.)  */
+
+.Ladd_xnan_or_inf:
+	/* If y is neither Infinity nor NaN, return x.  */
+	bnall	a3, a6, 1f
+	/* If x is a NaN, return it.  Otherwise, return y.  */
+	slli	a7, a2, 9
+	beqz	a7, .Ladd_ynan_or_inf
+1:	leaf_return
+
+.Ladd_ynan_or_inf:
+	/* Return y.  */
+	mov	a2, a3
+	leaf_return
+
+.Ladd_opposite_signs:
+	/* Operand signs differ.  Do a subtraction.  */
+	slli	a7, a6, 8
+	xor	a3, a3, a7
+	j	.Lsub_same_sign
+
+	.align	4
+	.global	__addsf3
+	.type	__addsf3, @function
+__addsf3:
+	leaf_entry sp, 16
+	movi	a6, 0x7f800000
+
+	/* Check if the two operands have the same sign.  */
+	xor	a7, a2, a3
+	bltz	a7, .Ladd_opposite_signs
+
+.Ladd_same_sign:	
+	/* Check if either exponent == 0x7f8 (i.e., NaN or Infinity).  */
+	ball	a2, a6, .Ladd_xnan_or_inf
+	ball	a3, a6, .Ladd_ynan_or_inf
+
+	/* Compare the exponents.  The smaller operand will be shifted
+	   right by the exponent difference and added to the larger
+	   one.  */
+	extui	a7, a2, 23, 9
+	extui	a8, a3, 23, 9
+	bltu	a7, a8, .Ladd_shiftx
+
+.Ladd_shifty:
+	/* Check if the smaller (or equal) exponent is zero.  */
+	bnone	a3, a6, .Ladd_yexpzero
+
+	/* Replace y sign/exponent with 0x008.  */
+	or	a3, a3, a6
+	slli	a3, a3, 8
+	srli	a3, a3, 8
+
+.Ladd_yexpdiff:
+	/* Compute the exponent difference.  */
+	sub	a10, a7, a8
+
+	/* Exponent difference > 32 -- just return the bigger value.  */
+	bgeui	a10, 32, 1f
+	
+	/* Shift y right by the exponent difference.  Any bits that are
+	   shifted out of y are saved in a9 for rounding the result.  */
+	ssr	a10
+	movi	a9, 0
+	src	a9, a3, a9
+	srl	a3, a3
+
+	/* Do the addition.  */
+	add	a2, a2, a3
+
+	/* Check if the add overflowed into the exponent.  */
+	extui	a10, a2, 23, 9
+	beq	a10, a7, .Ladd_round
+	mov	a8, a7
+	j	.Ladd_carry
+
+.Ladd_yexpzero:
+	/* y is a subnormal value.  Replace its sign/exponent with zero,
+	   i.e., no implicit "1.0", and increment the apparent exponent
+	   because subnormals behave as if they had the minimum (nonzero)
+	   exponent.  Test for the case when both exponents are zero.  */
+	slli	a3, a3, 9
+	srli	a3, a3, 9
+	bnone	a2, a6, .Ladd_bothexpzero
+	addi	a8, a8, 1
+	j	.Ladd_yexpdiff
+
+.Ladd_bothexpzero:
+	/* Both exponents are zero.  Handle this as a special case.  There
+	   is no need to shift or round, and the normal code for handling
+	   a carry into the exponent field will not work because it
+	   assumes there is an implicit "1.0" that needs to be added.  */
+	add	a2, a2, a3
+1:	leaf_return
+
+.Ladd_xexpzero:
+	/* Same as "yexpzero" except skip handling the case when both
+	   exponents are zero.  */
+	slli	a2, a2, 9
+	srli	a2, a2, 9
+	addi	a7, a7, 1
+	j	.Ladd_xexpdiff
+
+.Ladd_shiftx:
+	/* Same thing as the "shifty" code, but with x and y swapped.  Also,
+	   because the exponent difference is always nonzero in this version,
+	   the shift sequence can use SLL and skip loading a constant zero.  */
+	bnone	a2, a6, .Ladd_xexpzero
+
+	or	a2, a2, a6
+	slli	a2, a2, 8
+	srli	a2, a2, 8
+
+.Ladd_xexpdiff:
+	sub	a10, a8, a7
+	bgeui	a10, 32, .Ladd_returny
+	
+	ssr	a10
+	sll	a9, a2
+	srl	a2, a2
+
+	add	a2, a2, a3
+
+	/* Check if the add overflowed into the exponent.  */
+	extui	a10, a2, 23, 9
+	bne	a10, a8, .Ladd_carry
+
+.Ladd_round:
+	/* Round up if the leftover fraction is >= 1/2.  */
+	bgez	a9, 1f
+	addi	a2, a2, 1
+
+	/* Check if the leftover fraction is exactly 1/2.  */
+	slli	a9, a9, 1
+	beqz	a9, .Ladd_exactlyhalf
+1:	leaf_return
+
+.Ladd_returny:
+	mov	a2, a3
+	leaf_return
+
+.Ladd_carry:	
+	/* The addition has overflowed into the exponent field, so the
+	   value needs to be renormalized.  The mantissa of the result
+	   can be recovered by subtracting the original exponent and
+	   adding 0x800000 (which is the explicit "1.0" for the
+	   mantissa of the non-shifted operand -- the "1.0" for the
+	   shifted operand was already added).  The mantissa can then
+	   be shifted right by one bit.  The explicit "1.0" of the
+	   shifted mantissa then needs to be replaced by the exponent,
+	   incremented by one to account for the normalizing shift.
+	   It is faster to combine these operations: do the shift first
+	   and combine the additions and subtractions.  If x is the
+	   original exponent, the result is:
+	       shifted mantissa - (x << 22) + (1 << 22) + (x << 23)
+	   or:
+	       shifted mantissa + ((x + 1) << 22)
+	   Note that the exponent is incremented here by leaving the
+	   explicit "1.0" of the mantissa in the exponent field.  */
+
+	/* Shift x right by one bit.  Save the lsb.  */
+	mov	a10, a2
+	srli	a2, a2, 1
+
+	/* See explanation above.  The original exponent is in a8.  */
+	addi	a8, a8, 1
+	slli	a8, a8, 22
+	add	a2, a2, a8
+
+	/* Return an Infinity if the exponent overflowed.  */
+	ball	a2, a6, .Ladd_infinity
+	
+	/* Same thing as the "round" code except the msb of the leftover
+	   fraction is bit 0 of a10, with the rest of the fraction in a9.  */
+	bbci.l	a10, 0, 1f
+	addi	a2, a2, 1
+	beqz	a9, .Ladd_exactlyhalf
+1:	leaf_return
+
+.Ladd_infinity:
+	/* Clear the mantissa.  */
+	srli	a2, a2, 23
+	slli	a2, a2, 23
+
+	/* The sign bit may have been lost in a carry-out.  Put it back.  */
+	slli	a8, a8, 1
+	or	a2, a2, a8
+	leaf_return
+
+.Ladd_exactlyhalf:
+	/* Round down to the nearest even value.  */
+	srli	a2, a2, 1
+	slli	a2, a2, 1
+	leaf_return
+
+
+	/* Subtraction */
+__subsf3_aux:
+	
+	/* Handle NaNs and Infinities.  (This code is placed before the
+	   start of the function just to keep it in range of the limited
+	   branch displacements.)  */
+
+.Lsub_xnan_or_inf:
+	/* If y is neither Infinity nor NaN, return x.  */
+	bnall	a3, a6, 1f
+	/* Both x and y are either NaN or Inf, so the result is NaN.  */
+	movi	a4, 0x400000	/* make it a quiet NaN */
+	or	a2, a2, a4
+1:	leaf_return
+
+.Lsub_ynan_or_inf:
+	/* Negate y and return it.  */
+	slli	a7, a6, 8
+	xor	a2, a3, a7
+	leaf_return
+
+.Lsub_opposite_signs:
+	/* Operand signs differ.  Do an addition.  */
+	slli	a7, a6, 8
+	xor	a3, a3, a7
+	j	.Ladd_same_sign
+
+	.align	4
+	.global	__subsf3
+	.type	__subsf3, @function
+__subsf3:
+	leaf_entry sp, 16
+	movi	a6, 0x7f800000
+
+	/* Check if the two operands have the same sign.  */
+	xor	a7, a2, a3
+	bltz	a7, .Lsub_opposite_signs
+
+.Lsub_same_sign:	
+	/* Check if either exponent == 0x7f8 (i.e., NaN or Infinity).  */
+	ball	a2, a6, .Lsub_xnan_or_inf
+	ball	a3, a6, .Lsub_ynan_or_inf
+
+	/* Compare the operands.  In contrast to addition, the entire
+	   value matters here.  */
+	extui	a7, a2, 23, 8
+	extui	a8, a3, 23, 8
+	bltu	a2, a3, .Lsub_xsmaller
+
+.Lsub_ysmaller:
+	/* Check if the smaller (or equal) exponent is zero.  */
+	bnone	a3, a6, .Lsub_yexpzero
+
+	/* Replace y sign/exponent with 0x008.  */
+	or	a3, a3, a6
+	slli	a3, a3, 8
+	srli	a3, a3, 8
+
+.Lsub_yexpdiff:
+	/* Compute the exponent difference.  */
+	sub	a10, a7, a8
+
+	/* Exponent difference > 32 -- just return the bigger value.  */
+	bgeui	a10, 32, 1f
+	
+	/* Shift y right by the exponent difference.  Any bits that are
+	   shifted out of y are saved in a9 for rounding the result.  */
+	ssr	a10
+	movi	a9, 0
+	src	a9, a3, a9
+	srl	a3, a3
+
+	sub	a2, a2, a3
+
+	/* Subtract the leftover bits in a9 from zero and propagate any
+	   borrow from a2.  */
+	neg	a9, a9
+	addi	a10, a2, -1
+	movnez	a2, a10, a9
+
+	/* Check if the subtract underflowed into the exponent.  */
+	extui	a10, a2, 23, 8
+	beq	a10, a7, .Lsub_round
+	j	.Lsub_borrow
+
+.Lsub_yexpzero:
+	/* Return zero if the inputs are equal.  (For the non-subnormal
+	   case, subtracting the "1.0" will cause a borrow from the exponent
+	   and this case can be detected when handling the borrow.)  */
+	beq	a2, a3, .Lsub_return_zero
+
+	/* y is a subnormal value.  Replace its sign/exponent with zero,
+	   i.e., no implicit "1.0".  Unless x is also a subnormal, increment
+	   y's apparent exponent because subnormals behave as if they had
+	   the minimum (nonzero) exponent.  */
+	slli	a3, a3, 9
+	srli	a3, a3, 9
+	bnone	a2, a6, .Lsub_yexpdiff
+	addi	a8, a8, 1
+	j	.Lsub_yexpdiff
+
+.Lsub_returny:
+	/* Negate and return y.  */
+	slli	a7, a6, 8
+	xor	a2, a3, a7
+1:	leaf_return
+
+.Lsub_xsmaller:
+	/* Same thing as the "ysmaller" code, but with x and y swapped and
+	   with y negated.  */
+	bnone	a2, a6, .Lsub_xexpzero
+
+	or	a2, a2, a6
+	slli	a2, a2, 8
+	srli	a2, a2, 8
+
+.Lsub_xexpdiff:
+	sub	a10, a8, a7
+	bgeui	a10, 32, .Lsub_returny
+	
+	ssr	a10
+	movi	a9, 0
+	src	a9, a2, a9
+	srl	a2, a2
+
+	/* Negate y.  */
+	slli	a11, a6, 8
+	xor	a3, a3, a11
+
+	sub	a2, a3, a2
+
+	neg	a9, a9
+	addi	a10, a2, -1
+	movnez	a2, a10, a9
+
+	/* Check if the subtract underflowed into the exponent.  */
+	extui	a10, a2, 23, 8
+	bne	a10, a8, .Lsub_borrow
+
+.Lsub_round:
+	/* Round up if the leftover fraction is >= 1/2.  */
+	bgez	a9, 1f
+	addi	a2, a2, 1
+
+	/* Check if the leftover fraction is exactly 1/2.  */
+	slli	a9, a9, 1
+	beqz	a9, .Lsub_exactlyhalf
+1:	leaf_return
+
+.Lsub_xexpzero:
+	/* Same as "yexpzero".  */
+	beq	a2, a3, .Lsub_return_zero
+	slli	a2, a2, 9
+	srli	a2, a2, 9
+	bnone	a3, a6, .Lsub_xexpdiff
+	addi	a7, a7, 1
+	j	.Lsub_xexpdiff
+
+.Lsub_return_zero:
+	movi	a2, 0
+	leaf_return
+
+.Lsub_borrow:	
+	/* The subtraction has underflowed into the exponent field, so the
+	   value needs to be renormalized.  Shift the mantissa left as
+	   needed to remove any leading zeros and adjust the exponent
+	   accordingly.  If the exponent is not large enough to remove
+	   all the leading zeros, the result will be a subnormal value.  */
+
+	slli	a8, a2, 9
+	beqz	a8, .Lsub_xzero
+	do_nsau	a6, a8, a7, a11
+	srli	a8, a8, 9
+	bge	a6, a10, .Lsub_subnormal
+	addi	a6, a6, 1
+
+.Lsub_normalize_shift:
+	/* Shift the mantissa (a8/a9) left by a6.  */
+	ssl	a6
+	src	a8, a8, a9
+	sll	a9, a9
+
+	/* Combine the shifted mantissa with the sign and exponent,
+	   decrementing the exponent by a6.  (The exponent has already
+	   been decremented by one due to the borrow from the subtraction,
+	   but adding the mantissa will increment the exponent by one.)  */
+	srli	a2, a2, 23
+	sub	a2, a2, a6
+	slli	a2, a2, 23
+	add	a2, a2, a8
+	j	.Lsub_round
+
+.Lsub_exactlyhalf:
+	/* Round down to the nearest even value.  */
+	srli	a2, a2, 1
+	slli	a2, a2, 1
+	leaf_return
+
+.Lsub_xzero:
+	/* If there was a borrow from the exponent, and the mantissa and
+	   guard digits are all zero, then the inputs were equal and the
+	   result should be zero.  */
+	beqz	a9, .Lsub_return_zero
+
+	/* Only the guard digit is nonzero.  Shift by min(24, a10).  */
+	addi	a11, a10, -24
+	movi	a6, 24
+	movltz	a6, a10, a11
+	j	.Lsub_normalize_shift
+
+.Lsub_subnormal:
+	/* The exponent is too small to shift away all the leading zeros.
+	   Set a6 to the current exponent (which has already been
+	   decremented by the borrow) so that the exponent of the result
+	   will be zero.  Do not add 1 to a6 in this case, because: (1)
+	   adding the mantissa will not increment the exponent, so there is
+	   no need to subtract anything extra from the exponent to
+	   compensate, and (2) the effective exponent of a subnormal is 1
+	   not 0 so the shift amount must be 1 smaller than normal. */
+	mov	a6, a10
+	j	.Lsub_normalize_shift
+
+#endif /* L_addsubsf3 */
+
+#ifdef L_mulsf3
+
+	/* Multiplication */
+#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#define XCHAL_NO_MUL 1
+#endif
+
+__mulsf3_aux:
+
+	/* Handle unusual cases (zeros, subnormals, NaNs and Infinities).
+	   (This code is placed before the start of the function just to
+	   keep it in range of the limited branch displacements.)  */
+
+.Lmul_xexpzero:
+	/* Clear the sign bit of x.  */
+	slli	a2, a2, 1
+	srli	a2, a2, 1
+
+	/* If x is zero, return zero.  */
+	beqz	a2, .Lmul_return_zero
+
+	/* Normalize x.  Adjust the exponent in a8.  */
+	do_nsau	a10, a2, a11, a12
+	addi	a10, a10, -8
+	ssl	a10
+	sll	a2, a2 
+	movi	a8, 1
+	sub	a8, a8, a10
+	j	.Lmul_xnormalized	
+	
+.Lmul_yexpzero:
+	/* Clear the sign bit of y.  */
+	slli	a3, a3, 1
+	srli	a3, a3, 1
+
+	/* If y is zero, return zero.  */
+	beqz	a3, .Lmul_return_zero
+
+	/* Normalize y.  Adjust the exponent in a9.  */
+	do_nsau	a10, a3, a11, a12
+	addi	a10, a10, -8
+	ssl	a10
+	sll	a3, a3
+	movi	a9, 1
+	sub	a9, a9, a10
+	j	.Lmul_ynormalized	
+
+.Lmul_return_zero:
+	/* Return zero with the appropriate sign bit.  */
+	srli	a2, a7, 31
+	slli	a2, a2, 31
+	j	.Lmul_done
+
+.Lmul_xnan_or_inf:
+	/* If y is zero, return NaN.  */
+	slli	a8, a3, 1
+	bnez	a8, 1f
+	movi	a4, 0x400000	/* make it a quiet NaN */
+	or	a2, a2, a4
+	j	.Lmul_done
+1:
+	/* If y is NaN, return y.  */
+	bnall	a3, a6, .Lmul_returnx
+	slli	a8, a3, 9
+	beqz	a8, .Lmul_returnx
+
+.Lmul_returny:
+	mov	a2, a3
+
+.Lmul_returnx:
+	/* Set the sign bit and return.  */
+	extui	a7, a7, 31, 1
+	slli	a2, a2, 1
+	ssai	1
+	src	a2, a7, a2
+	j	.Lmul_done
+
+.Lmul_ynan_or_inf:
+	/* If x is zero, return NaN.  */
+	slli	a8, a2, 1
+	bnez	a8, .Lmul_returny
+	movi	a7, 0x400000	/* make it a quiet NaN */
+	or	a2, a3, a7
+	j	.Lmul_done
+
+	.align	4
+	.global	__mulsf3
+	.type	__mulsf3, @function
+__mulsf3:
+#if __XTENSA_CALL0_ABI__
+	leaf_entry sp, 32
+	addi	sp, sp, -32
+	s32i	a12, sp, 16
+	s32i	a13, sp, 20
+	s32i	a14, sp, 24
+	s32i	a15, sp, 28
+#elif XCHAL_NO_MUL
+	/* This is not really a leaf function; allocate enough stack space
+	   to allow CALL12s to a helper function.  */
+	leaf_entry sp, 64
+#else
+	leaf_entry sp, 32
+#endif
+	movi	a6, 0x7f800000
+
+	/* Get the sign of the result.  */
+	xor	a7, a2, a3
+
+	/* Check for NaN and infinity.  */
+	ball	a2, a6, .Lmul_xnan_or_inf
+	ball	a3, a6, .Lmul_ynan_or_inf
+
+	/* Extract the exponents.  */
+	extui	a8, a2, 23, 8
+	extui	a9, a3, 23, 8
+
+	beqz	a8, .Lmul_xexpzero
+.Lmul_xnormalized:	
+	beqz	a9, .Lmul_yexpzero
+.Lmul_ynormalized:	
+
+	/* Add the exponents.  */
+	add	a8, a8, a9
+
+	/* Replace sign/exponent fields with explicit "1.0".  */
+	movi	a10, 0xffffff
+	or	a2, a2, a6
+	and	a2, a2, a10
+	or	a3, a3, a6
+	and	a3, a3, a10
+
+	/* Multiply 32x32 to 64 bits.  The result ends up in a2/a6.  */
+
+#if XCHAL_HAVE_MUL32_HIGH
+
+	mull	a6, a2, a3
+	muluh	a2, a2, a3
+
+#else
+
+	/* Break the inputs into 16-bit chunks and compute 4 32-bit partial
+	   products.  These partial products are:
+
+		0 xl * yl
+
+		1 xl * yh
+		2 xh * yl
+
+		3 xh * yh
+
+	   If using the Mul16 or Mul32 multiplier options, these input
+	   chunks must be stored in separate registers.  For Mac16, the
+	   UMUL.AA.* opcodes can specify that the inputs come from either
+	   half of the registers, so there is no need to shift them out
+	   ahead of time.  If there is no multiply hardware, the 16-bit
+	   chunks can be extracted when setting up the arguments to the
+	   separate multiply function.  */
+
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
+	/* Calling a separate multiply function will clobber a0 and requires
+	   use of a8 as a temporary, so save those values now.  (The function
+	   uses a custom ABI so nothing else needs to be saved.)  */
+	s32i	a0, sp, 0
+	s32i	a8, sp, 4
+#endif
+
+#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32
+
+#define a2h a4
+#define a3h a5
+
+	/* Get the high halves of the inputs into registers.  */
+	srli	a2h, a2, 16
+	srli	a3h, a3, 16
+
+#define a2l a2
+#define a3l a3
+
+#if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16
+	/* Clear the high halves of the inputs.  This does not matter
+	   for MUL16 because the high bits are ignored.  */
+	extui	a2, a2, 0, 16
+	extui	a3, a3, 0, 16
+#endif
+#endif /* MUL16 || MUL32 */
+
+
+#if XCHAL_HAVE_MUL16
+
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+	mul16u	dst, xreg ## xhalf, yreg ## yhalf
+
+#elif XCHAL_HAVE_MUL32
+
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+	mull	dst, xreg ## xhalf, yreg ## yhalf
+
+#elif XCHAL_HAVE_MAC16
+
+/* The preprocessor insists on inserting a space when concatenating after
+   a period in the definition of do_mul below.  These macros are a workaround
+   using underscores instead of periods when doing the concatenation.  */
+#define umul_aa_ll umul.aa.ll
+#define umul_aa_lh umul.aa.lh
+#define umul_aa_hl umul.aa.hl
+#define umul_aa_hh umul.aa.hh
+
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+	umul_aa_ ## xhalf ## yhalf	xreg, yreg; \
+	rsr	dst, ACCLO
+
+#else /* no multiply hardware */
+	
+#define set_arg_l(dst, src) \
+	extui	dst, src, 0, 16
+#define set_arg_h(dst, src) \
+	srli	dst, src, 16
+
+#if __XTENSA_CALL0_ABI__
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+	set_arg_ ## xhalf (a13, xreg); \
+	set_arg_ ## yhalf (a14, yreg); \
+	call0	.Lmul_mulsi3; \
+	mov	dst, a12
+#else
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+	set_arg_ ## xhalf (a14, xreg); \
+	set_arg_ ## yhalf (a15, yreg); \
+	call12	.Lmul_mulsi3; \
+	mov	dst, a14
+#endif /* __XTENSA_CALL0_ABI__ */
+
+#endif /* no multiply hardware */
+
+	/* Add pp1 and pp2 into a6 with carry-out in a9.  */
+	do_mul(a6, a2, l, a3, h)	/* pp 1 */
+	do_mul(a11, a2, h, a3, l)	/* pp 2 */
+	movi	a9, 0
+	add	a6, a6, a11
+	bgeu	a6, a11, 1f
+	addi	a9, a9, 1
+1:
+	/* Shift the high half of a9/a6 into position in a9.  Note that
+	   this value can be safely incremented without any carry-outs.  */
+	ssai	16
+	src	a9, a9, a6
+
+	/* Compute the low word into a6.  */
+	do_mul(a11, a2, l, a3, l)	/* pp 0 */
+	sll	a6, a6
+	add	a6, a6, a11
+	bgeu	a6, a11, 1f
+	addi	a9, a9, 1
+1:
+	/* Compute the high word into a2.  */
+	do_mul(a2, a2, h, a3, h)	/* pp 3 */
+	add	a2, a2, a9
+	
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
+	/* Restore values saved on the stack during the multiplication.  */
+	l32i	a0, sp, 0
+	l32i	a8, sp, 4
+#endif
+#endif /* ! XCHAL_HAVE_MUL32_HIGH */
+
+	/* Shift left by 9 bits, unless there was a carry-out from the
+	   multiply, in which case, shift by 8 bits and increment the
+	   exponent.  */
+	movi	a4, 9
+	srli	a5, a2, 24 - 9
+	beqz	a5, 1f
+	addi	a4, a4, -1
+	addi	a8, a8, 1
+1:	ssl	a4
+	src	a2, a2, a6
+	sll	a6, a6
+
+	/* Subtract the extra bias from the exponent sum (plus one to account
+	   for the explicit "1.0" of the mantissa that will be added to the
+	   exponent in the final result).  */
+	movi	a4, 0x80
+	sub	a8, a8, a4
+	
+	/* Check for over/underflow.  The value in a8 is one less than the
+	   final exponent, so values in the range 0..fd are OK here.  */
+	movi	a4, 0xfe
+	bgeu	a8, a4, .Lmul_overflow
+	
+.Lmul_round:
+	/* Round.  */
+	bgez	a6, .Lmul_rounded
+	addi	a2, a2, 1
+	slli	a6, a6, 1
+	beqz	a6, .Lmul_exactlyhalf
+
+.Lmul_rounded:
+	/* Add the exponent to the mantissa.  */
+	slli	a8, a8, 23
+	add	a2, a2, a8
+
+.Lmul_addsign:
+	/* Add the sign bit.  */
+	srli	a7, a7, 31
+	slli	a7, a7, 31
+	or	a2, a2, a7
+
+.Lmul_done:
+#if __XTENSA_CALL0_ABI__
+	l32i	a12, sp, 16
+	l32i	a13, sp, 20
+	l32i	a14, sp, 24
+	l32i	a15, sp, 28
+	addi	sp, sp, 32
+#endif
+	leaf_return
+
+.Lmul_exactlyhalf:
+	/* Round down to the nearest even value.  */
+	srli	a2, a2, 1
+	slli	a2, a2, 1
+	j	.Lmul_rounded
+
+.Lmul_overflow:
+	bltz	a8, .Lmul_underflow
+	/* Return +/- Infinity.  */
+	movi	a8, 0xff
+	slli	a2, a8, 23
+	j	.Lmul_addsign
+
+.Lmul_underflow:
+	/* Create a subnormal value, where the exponent field contains zero,
+	   but the effective exponent is 1.  The value of a8 is one less than
+	   the actual exponent, so just negate it to get the shift amount.  */
+	neg	a8, a8
+	mov	a9, a6
+	ssr	a8
+	bgeui	a8, 32, .Lmul_flush_to_zero
+	
+	/* Shift a2 right.  Any bits that are shifted out of a2 are saved
+	   in a6 (combined with the shifted-out bits currently in a6) for
+	   rounding the result.  */
+	sll	a6, a2
+	srl	a2, a2
+
+	/* Set the exponent to zero.  */
+	movi	a8, 0
+
+	/* Pack any nonzero bits shifted out into a6.  */
+	beqz	a9, .Lmul_round
+	movi	a9, 1
+	or	a6, a6, a9
+	j	.Lmul_round
+	
+.Lmul_flush_to_zero:
+	/* Return zero with the appropriate sign bit.  */
+	srli	a2, a7, 31
+	slli	a2, a2, 31
+	j	.Lmul_done
+
+#if XCHAL_NO_MUL
+	
+	/* For Xtensa processors with no multiply hardware, this simplified
+	   version of _mulsi3 is used for multiplying 16-bit chunks of
+	   the floating-point mantissas.  When using CALL0, this function
+	   uses a custom ABI: the inputs are passed in a13 and a14, the
+	   result is returned in a12, and a8 and a15 are clobbered.  */
+	.align	4
+.Lmul_mulsi3:
+	leaf_entry sp, 16
+	.macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2
+	movi	\dst, 0
+1:	add	\tmp1, \src2, \dst
+	extui	\tmp2, \src1, 0, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx2 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 1, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx4 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 2, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx8 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 3, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	srli	\src1, \src1, 4
+	slli	\src2, \src2, 4
+	bnez	\src1, 1b
+	.endm
+#if __XTENSA_CALL0_ABI__
+	mul_mulsi3_body a12, a13, a14, a15, a8
+#else
+	/* The result will be written into a2, so save that argument in a4.  */
+	mov	a4, a2
+	mul_mulsi3_body a2, a4, a3, a5, a6
+#endif
+	leaf_return
+#endif /* XCHAL_NO_MUL */
+#endif /* L_mulsf3 */
+
+#ifdef L_divsf3
+
+	/* Division */
+__divsf3_aux:
+
+	/* Handle unusual cases (zeros, subnormals, NaNs and Infinities).
+	   (This code is placed before the start of the function just to
+	   keep it in range of the limited branch displacements.)  */
+
+.Ldiv_yexpzero:
+	/* Clear the sign bit of y.  */
+	slli	a3, a3, 1
+	srli	a3, a3, 1
+
+	/* Check for division by zero.  */
+	beqz	a3, .Ldiv_yzero
+
+	/* Normalize y.  Adjust the exponent in a9.  */
+	do_nsau	a10, a3, a4, a5
+	addi	a10, a10, -8
+	ssl	a10
+	sll	a3, a3
+	movi	a9, 1
+	sub	a9, a9, a10
+	j	.Ldiv_ynormalized	
+
+.Ldiv_yzero:
+	/* y is zero.  Return NaN if x is also zero; otherwise, infinity.  */
+	slli	a4, a2, 1
+	srli	a4, a4, 1
+	srli	a2, a7, 31
+	slli	a2, a2, 31
+	or	a2, a2, a6
+	bnez	a4, 1f
+	movi	a4, 0x400000	/* make it a quiet NaN */
+	or	a2, a2, a4
+1:	leaf_return
+
+.Ldiv_xexpzero:
+	/* Clear the sign bit of x.  */
+	slli	a2, a2, 1
+	srli	a2, a2, 1
+
+	/* If x is zero, return zero.  */
+	beqz	a2, .Ldiv_return_zero
+
+	/* Normalize x.  Adjust the exponent in a8.  */
+	do_nsau	a10, a2, a4, a5
+	addi	a10, a10, -8
+	ssl	a10
+	sll	a2, a2
+	movi	a8, 1
+	sub	a8, a8, a10
+	j	.Ldiv_xnormalized	
+	
+.Ldiv_return_zero:
+	/* Return zero with the appropriate sign bit.  */
+	srli	a2, a7, 31
+	slli	a2, a2, 31
+	leaf_return
+
+.Ldiv_xnan_or_inf:
+	/* Set the sign bit of the result.  */
+	srli	a7, a3, 31
+	slli	a7, a7, 31
+	xor	a2, a2, a7
+	/* If y is NaN or Inf, return NaN.  */
+	bnall	a3, a6, 1f
+	movi	a4, 0x400000	/* make it a quiet NaN */
+	or	a2, a2, a4
+1:	leaf_return
+
+.Ldiv_ynan_or_inf:
+	/* If y is Infinity, return zero.  */
+	slli	a8, a3, 9
+	beqz	a8, .Ldiv_return_zero
+	/* y is NaN; return it.  */
+	mov	a2, a3
+	leaf_return
+
+	.align	4
+	.global	__divsf3
+	.type	__divsf3, @function
+__divsf3:
+	leaf_entry sp, 16
+	movi	a6, 0x7f800000
+
+	/* Get the sign of the result.  */
+	xor	a7, a2, a3
+
+	/* Check for NaN and infinity.  */
+	ball	a2, a6, .Ldiv_xnan_or_inf
+	ball	a3, a6, .Ldiv_ynan_or_inf
+
+	/* Extract the exponents.  */
+	extui	a8, a2, 23, 8
+	extui	a9, a3, 23, 8
+
+	beqz	a9, .Ldiv_yexpzero
+.Ldiv_ynormalized:	
+	beqz	a8, .Ldiv_xexpzero
+.Ldiv_xnormalized:	
+
+	/* Subtract the exponents.  */
+	sub	a8, a8, a9
+
+	/* Replace sign/exponent fields with explicit "1.0".  */
+	movi	a10, 0xffffff
+	or	a2, a2, a6
+	and	a2, a2, a10
+	or	a3, a3, a6
+	and	a3, a3, a10
+
+	/* The first digit of the mantissa division must be a one.
+	   Shift x (and adjust the exponent) as needed to make this true.  */
+	bltu	a3, a2, 1f
+	slli	a2, a2, 1
+	addi	a8, a8, -1
+1:
+	/* Do the first subtraction and shift.  */
+	sub	a2, a2, a3
+	slli	a2, a2, 1
+
+	/* Put the quotient into a10.  */
+	movi	a10, 1
+
+	/* Divide one bit at a time for 23 bits.  */
+	movi	a9, 23
+#if XCHAL_HAVE_LOOPS
+	loop	a9, .Ldiv_loopend
+#endif
+.Ldiv_loop:
+	/* Shift the quotient << 1.  */
+	slli	a10, a10, 1
+
+	/* Is this digit a 0 or 1?  */
+	bltu	a2, a3, 1f
+
+	/* Output a 1 and subtract.  */
+	addi	a10, a10, 1
+	sub	a2, a2, a3
+
+	/* Shift the dividend << 1.  */
+1:	slli	a2, a2, 1
+
+#if !XCHAL_HAVE_LOOPS
+	addi	a9, a9, -1
+	bnez	a9, .Ldiv_loop
+#endif
+.Ldiv_loopend:
+
+	/* Add the exponent bias (less one to account for the explicit "1.0"
+	   of the mantissa that will be added to the exponent in the final
+	   result).  */
+	addi	a8, a8, 0x7e
+	
+	/* Check for over/underflow.  The value in a8 is one less than the
+	   final exponent, so values in the range 0..fd are OK here.  */
+	movi	a4, 0xfe
+	bgeu	a8, a4, .Ldiv_overflow
+	
+.Ldiv_round:
+	/* Round.  The remainder (<< 1) is in a2.  */
+	bltu	a2, a3, .Ldiv_rounded
+	addi	a10, a10, 1
+	beq	a2, a3, .Ldiv_exactlyhalf
+
+.Ldiv_rounded:
+	/* Add the exponent to the mantissa.  */
+	slli	a8, a8, 23
+	add	a2, a10, a8
+
+.Ldiv_addsign:
+	/* Add the sign bit.  */
+	srli	a7, a7, 31
+	slli	a7, a7, 31
+	or	a2, a2, a7
+	leaf_return
+
+.Ldiv_overflow:
+	bltz	a8, .Ldiv_underflow
+	/* Return +/- Infinity.  */
+	addi	a8, a4, 1	/* 0xff */
+	slli	a2, a8, 23
+	j	.Ldiv_addsign
+
+.Ldiv_exactlyhalf:
+	/* Remainder is exactly half the divisor.  Round even.  */
+	srli	a10, a10, 1
+	slli	a10, a10, 1
+	j	.Ldiv_rounded
+
+.Ldiv_underflow:
+	/* Create a subnormal value, where the exponent field contains zero,
+	   but the effective exponent is 1.  The value of a8 is one less than
+	   the actual exponent, so just negate it to get the shift amount.  */
+	neg	a8, a8
+	ssr	a8
+	bgeui	a8, 32, .Ldiv_flush_to_zero
+	
+	/* Shift a10 right.  Any bits that are shifted out of a10 are
+	   saved in a6 for rounding the result.  */
+	sll	a6, a10
+	srl	a10, a10
+
+	/* Set the exponent to zero.  */
+	movi	a8, 0
+
+	/* Pack any nonzero remainder (in a2) into a6.  */
+	beqz	a2, 1f
+	movi	a9, 1
+	or	a6, a6, a9
+	
+	/* Round a10 based on the bits shifted out into a6.  */
+1:	bgez	a6, .Ldiv_rounded
+	addi	a10, a10, 1
+	slli	a6, a6, 1
+	bnez	a6, .Ldiv_rounded
+	srli	a10, a10, 1
+	slli	a10, a10, 1
+	j	.Ldiv_rounded
+
+.Ldiv_flush_to_zero:
+	/* Return zero with the appropriate sign bit.  */
+	srli	a2, a7, 31
+	slli	a2, a2, 31
+	leaf_return
+
+#endif /* L_divsf3 */
+
+#ifdef L_cmpsf2
+
+	/* Equal and Not Equal */
+
+	.align	4
+	.global	__eqsf2
+	.global	__nesf2
+	.set	__nesf2, __eqsf2
+	.type	__eqsf2, @function
+__eqsf2:
+	leaf_entry sp, 16
+	bne	a2, a3, 4f
+
+	/* The values are equal but NaN != NaN.  Check the exponent.  */
+	movi	a6, 0x7f800000
+	ball	a2, a6, 3f
+
+	/* Equal.  */
+	movi	a2, 0
+	leaf_return
+
+	/* Not equal.  */
+2:	movi	a2, 1
+	leaf_return
+
+	/* Check if the mantissas are nonzero.  */
+3:	slli	a7, a2, 9
+	j	5f
+
+	/* Check if x and y are zero with different signs.  */
+4:	or	a7, a2, a3
+	slli	a7, a7, 1
+
+	/* Equal if a7 == 0, where a7 is either abs(x | y) or the mantissa
+	   or x when exponent(x) = 0x7f8 and x == y.  */
+5:	movi	a2, 0
+	movi	a3, 1
+	movnez	a2, a3, a7	
+	leaf_return
+
+
+	/* Greater Than */
+
+	.align	4
+	.global	__gtsf2
+	.type	__gtsf2, @function
+__gtsf2:
+	leaf_entry sp, 16
+	movi	a6, 0x7f800000
+	ball	a2, a6, 2f
+1:	bnall	a3, a6, .Lle_cmp
+
+	/* Check if y is a NaN.  */
+	slli	a7, a3, 9
+	beqz	a7, .Lle_cmp
+	movi	a2, 0
+	leaf_return
+
+	/* Check if x is a NaN.  */
+2:	slli	a7, a2, 9
+	beqz	a7, 1b
+	movi	a2, 0
+	leaf_return
+
+
+	/* Less Than or Equal */
+
+	.align	4
+	.global	__lesf2
+	.type	__lesf2, @function
+__lesf2:
+	leaf_entry sp, 16
+	movi	a6, 0x7f800000
+	ball	a2, a6, 2f
+1:	bnall	a3, a6, .Lle_cmp
+
+	/* Check if y is a NaN.  */
+	slli	a7, a3, 9
+	beqz	a7, .Lle_cmp
+	movi	a2, 1
+	leaf_return
+
+	/* Check if x is a NaN.  */
+2:	slli	a7, a2, 9
+	beqz	a7, 1b
+	movi	a2, 1
+	leaf_return
+
+.Lle_cmp:
+	/* Check if x and y have different signs.  */
+	xor	a7, a2, a3
+	bltz	a7, .Lle_diff_signs
+
+	/* Check if x is negative.  */
+	bltz	a2, .Lle_xneg
+
+	/* Check if x <= y.  */
+	bltu	a3, a2, 5f
+4:	movi	a2, 0
+	leaf_return
+
+.Lle_xneg:
+	/* Check if y <= x.  */
+	bgeu	a2, a3, 4b
+5:	movi	a2, 1
+	leaf_return
+
+.Lle_diff_signs:
+	bltz	a2, 4b
+
+	/* Check if both x and y are zero.  */
+	or	a7, a2, a3
+	slli	a7, a7, 1
+	movi	a2, 1
+	movi	a3, 0
+	moveqz	a2, a3, a7
+	leaf_return
+
+
+	/* Greater Than or Equal */
+
+	.align	4
+	.global	__gesf2
+	.type	__gesf2, @function
+__gesf2:
+	leaf_entry sp, 16
+	movi	a6, 0x7f800000
+	ball	a2, a6, 2f
+1:	bnall	a3, a6, .Llt_cmp
+
+	/* Check if y is a NaN.  */
+	slli	a7, a3, 9
+	beqz	a7, .Llt_cmp
+	movi	a2, -1
+	leaf_return
+
+	/* Check if x is a NaN.  */
+2:	slli	a7, a2, 9
+	beqz	a7, 1b
+	movi	a2, -1
+	leaf_return
+
+
+	/* Less Than */
+
+	.align	4
+	.global	__ltsf2
+	.type	__ltsf2, @function
+__ltsf2:
+	leaf_entry sp, 16
+	movi	a6, 0x7f800000
+	ball	a2, a6, 2f
+1:	bnall	a3, a6, .Llt_cmp
+
+	/* Check if y is a NaN.  */
+	slli	a7, a3, 9
+	beqz	a7, .Llt_cmp
+	movi	a2, 0
+	leaf_return
+
+	/* Check if x is a NaN.  */
+2:	slli	a7, a2, 9
+	beqz	a7, 1b
+	movi	a2, 0
+	leaf_return
+
+.Llt_cmp:
+	/* Check if x and y have different signs.  */
+	xor	a7, a2, a3
+	bltz	a7, .Llt_diff_signs
+
+	/* Check if x is negative.  */
+	bltz	a2, .Llt_xneg
+
+	/* Check if x < y.  */
+	bgeu	a2, a3, 5f
+4:	movi	a2, -1
+	leaf_return
+
+.Llt_xneg:
+	/* Check if y < x.  */
+	bltu	a3, a2, 4b
+5:	movi	a2, 0
+	leaf_return
+
+.Llt_diff_signs:
+	bgez	a2, 5b
+
+	/* Check if both x and y are nonzero.  */
+	or	a7, a2, a3
+	slli	a7, a7, 1
+	movi	a2, 0
+	movi	a3, -1
+	movnez	a2, a3, a7
+	leaf_return
+
+
+	/* Unordered */
+
+	.align	4
+	.global	__unordsf2
+	.type	__unordsf2, @function
+__unordsf2:
+	leaf_entry sp, 16
+	movi	a6, 0x7f800000
+	ball	a2, a6, 3f
+1:	ball	a3, a6, 4f
+2:	movi	a2, 0
+	leaf_return
+
+3:	slli	a7, a2, 9
+	beqz	a7, 1b
+	movi	a2, 1
+	leaf_return
+
+4:	slli	a7, a3, 9
+	beqz	a7, 2b
+	movi	a2, 1
+	leaf_return
+
+#endif /* L_cmpsf2 */
+
+#ifdef L_fixsfsi
+
+	.align	4
+	.global	__fixsfsi
+	.type	__fixsfsi, @function
+__fixsfsi:
+	leaf_entry sp, 16
+
+	/* Check for NaN and Infinity.  */
+	movi	a6, 0x7f800000
+	ball	a2, a6, .Lfixsfsi_nan_or_inf
+
+	/* Extract the exponent and check if 0 < (exp - 0x7e) < 32.  */
+	extui	a4, a2, 23, 8
+	addi	a4, a4, -0x7e
+	bgei	a4, 32, .Lfixsfsi_maxint
+	blti	a4, 1, .Lfixsfsi_zero
+
+	/* Add explicit "1.0" and shift << 8.  */
+	or	a7, a2, a6
+	slli	a5, a7, 8
+
+	/* Shift back to the right, based on the exponent.  */
+	ssl	a4		/* shift by 32 - a4 */
+	srl	a5, a5
+
+	/* Negate the result if sign != 0.  */
+	neg	a2, a5
+	movgez	a2, a5, a7
+	leaf_return
+
+.Lfixsfsi_nan_or_inf:
+	/* Handle Infinity and NaN.  */
+	slli	a4, a2, 9
+	beqz	a4, .Lfixsfsi_maxint
+
+	/* Translate NaN to +maxint.  */
+	movi	a2, 0
+
+.Lfixsfsi_maxint:
+	slli	a4, a6, 8	/* 0x80000000 */
+	addi	a5, a4, -1	/* 0x7fffffff */
+	movgez	a4, a5, a2
+	mov	a2, a4
+	leaf_return
+
+.Lfixsfsi_zero:
+	movi	a2, 0
+	leaf_return
+
+#endif /* L_fixsfsi */
+
+#ifdef L_fixsfdi
+
+	.align	4
+	.global	__fixsfdi
+	.type	__fixsfdi, @function
+__fixsfdi:
+	leaf_entry sp, 16
+
+	/* Check for NaN and Infinity.  */
+	movi	a6, 0x7f800000
+	ball	a2, a6, .Lfixsfdi_nan_or_inf
+
+	/* Extract the exponent and check if 0 < (exp - 0x7e) < 64.  */
+	extui	a4, a2, 23, 8
+	addi	a4, a4, -0x7e
+	bgei	a4, 64, .Lfixsfdi_maxint
+	blti	a4, 1, .Lfixsfdi_zero
+
+	/* Add explicit "1.0" and shift << 8.  */
+	or	a7, a2, a6
+	slli	xh, a7, 8
+
+	/* Shift back to the right, based on the exponent.  */
+	ssl	a4		/* shift by 64 - a4 */
+	bgei	a4, 32, .Lfixsfdi_smallshift
+	srl	xl, xh
+	movi	xh, 0
+
+.Lfixsfdi_shifted:	
+	/* Negate the result if sign != 0.  */
+	bgez	a7, 1f
+	neg	xl, xl
+	neg	xh, xh
+	beqz	xl, 1f
+	addi	xh, xh, -1
+1:	leaf_return
+
+.Lfixsfdi_smallshift:
+	movi	xl, 0
+	sll	xl, xh
+	srl	xh, xh
+	j	.Lfixsfdi_shifted
+
+.Lfixsfdi_nan_or_inf:
+	/* Handle Infinity and NaN.  */
+	slli	a4, a2, 9
+	beqz	a4, .Lfixsfdi_maxint
+
+	/* Translate NaN to +maxint.  */
+	movi	a2, 0
+
+.Lfixsfdi_maxint:
+	slli	a7, a6, 8	/* 0x80000000 */
+	bgez	a2, 1f
+	mov	xh, a7
+	movi	xl, 0
+	leaf_return
+
+1:	addi	xh, a7, -1	/* 0x7fffffff */
+	movi	xl, -1
+	leaf_return
+
+.Lfixsfdi_zero:
+	movi	xh, 0
+	movi	xl, 0
+	leaf_return
+
+#endif /* L_fixsfdi */
+
+#ifdef L_fixunssfsi
+
+	.align	4
+	.global	__fixunssfsi
+	.type	__fixunssfsi, @function
+__fixunssfsi:
+	leaf_entry sp, 16
+
+	/* Check for NaN and Infinity.  */
+	movi	a6, 0x7f800000
+	ball	a2, a6, .Lfixunssfsi_nan_or_inf
+
+	/* Extract the exponent and check if 0 <= (exp - 0x7f) < 32.  */
+	extui	a4, a2, 23, 8
+	addi	a4, a4, -0x7f
+	bgei	a4, 32, .Lfixunssfsi_maxint
+	bltz	a4, .Lfixunssfsi_zero
+
+	/* Add explicit "1.0" and shift << 8.  */
+	or	a7, a2, a6
+	slli	a5, a7, 8
+
+	/* Shift back to the right, based on the exponent.  */
+	addi	a4, a4, 1
+	beqi	a4, 32, .Lfixunssfsi_bigexp
+	ssl	a4		/* shift by 32 - a4 */
+	srl	a5, a5
+
+	/* Negate the result if sign != 0.  */
+	neg	a2, a5
+	movgez	a2, a5, a7
+	leaf_return
+
+.Lfixunssfsi_nan_or_inf:
+	/* Handle Infinity and NaN.  */
+	slli	a4, a2, 9
+	beqz	a4, .Lfixunssfsi_maxint
+
+	/* Translate NaN to 0xffffffff.  */
+	movi	a2, -1
+	leaf_return
+
+.Lfixunssfsi_maxint:
+	slli	a4, a6, 8	/* 0x80000000 */
+	movi	a5, -1		/* 0xffffffff */
+	movgez	a4, a5, a2
+	mov	a2, a4
+	leaf_return
+
+.Lfixunssfsi_zero:
+	movi	a2, 0
+	leaf_return
+
+.Lfixunssfsi_bigexp:
+	/* Handle unsigned maximum exponent case.  */
+	bltz	a2, 1f
+	mov	a2, a5		/* no shift needed */
+	leaf_return
+
+	/* Return 0x80000000 if negative.  */
+1:	slli	a2, a6, 8
+	leaf_return
+
+#endif /* L_fixunssfsi */
+
+#ifdef L_fixunssfdi
+
+	.align	4
+	.global	__fixunssfdi
+	.type	__fixunssfdi, @function
+__fixunssfdi:
+	leaf_entry sp, 16
+
+	/* Check for NaN and Infinity.  */
+	movi	a6, 0x7f800000
+	ball	a2, a6, .Lfixunssfdi_nan_or_inf
+
+	/* Extract the exponent and check if 0 <= (exp - 0x7f) < 64.  */
+	extui	a4, a2, 23, 8
+	addi	a4, a4, -0x7f
+	bgei	a4, 64, .Lfixunssfdi_maxint
+	bltz	a4, .Lfixunssfdi_zero
+
+	/* Add explicit "1.0" and shift << 8.  */
+	or	a7, a2, a6
+	slli	xh, a7, 8
+
+	/* Shift back to the right, based on the exponent.  */
+	addi	a4, a4, 1
+	beqi	a4, 64, .Lfixunssfdi_bigexp
+	ssl	a4		/* shift by 64 - a4 */
+	bgei	a4, 32, .Lfixunssfdi_smallshift
+	srl	xl, xh
+	movi	xh, 0
+
+.Lfixunssfdi_shifted:
+	/* Negate the result if sign != 0.  */
+	bgez	a7, 1f
+	neg	xl, xl
+	neg	xh, xh
+	beqz	xl, 1f
+	addi	xh, xh, -1
+1:	leaf_return
+
+.Lfixunssfdi_smallshift:
+	movi	xl, 0
+	src	xl, xh, xl
+	srl	xh, xh
+	j	.Lfixunssfdi_shifted
+
+.Lfixunssfdi_nan_or_inf:
+	/* Handle Infinity and NaN.  */
+	slli	a4, a2, 9
+	beqz	a4, .Lfixunssfdi_maxint
+
+	/* Translate NaN to 0xffffffff.... */
+1:	movi	xh, -1
+	movi	xl, -1
+	leaf_return
+
+.Lfixunssfdi_maxint:
+	bgez	a2, 1b
+2:	slli	xh, a6, 8	/* 0x80000000 */
+	movi	xl, 0
+	leaf_return
+
+.Lfixunssfdi_zero:
+	movi	xh, 0
+	movi	xl, 0
+	leaf_return
+
+.Lfixunssfdi_bigexp:
+	/* Handle unsigned maximum exponent case.  */
+	bltz	a7, 2b
+	movi	xl, 0
+	leaf_return		/* no shift needed */
+
+#endif /* L_fixunssfdi */
+
+#ifdef L_floatsisf
+
+	.align	4
+	.global	__floatunsisf
+	.type	__floatunsisf, @function
+__floatunsisf:
+	leaf_entry sp, 16
+	beqz	a2, .Lfloatsisf_return
+
+	/* Set the sign to zero and jump to the floatsisf code.  */
+	movi	a7, 0
+	j	.Lfloatsisf_normalize
+
+	.align	4
+	.global	__floatsisf
+	.type	__floatsisf, @function
+__floatsisf:
+	leaf_entry sp, 16
+
+	/* Check for zero.  */
+	beqz	a2, .Lfloatsisf_return
+
+	/* Save the sign.  */
+	extui	a7, a2, 31, 1
+
+	/* Get the absolute value.  */
+#if XCHAL_HAVE_ABS
+	abs	a2, a2
+#else
+	neg	a4, a2
+	movltz	a2, a4, a2
+#endif
+
+.Lfloatsisf_normalize:
+	/* Normalize with the first 1 bit in the msb.  */
+	do_nsau	a4, a2, a5, a6
+	ssl	a4
+	sll	a5, a2
+
+	/* Shift the mantissa into position, with rounding bits in a6.  */
+	srli	a2, a5, 8
+	slli	a6, a5, (32 - 8)
+
+	/* Set the exponent.  */
+	movi	a5, 0x9d	/* 0x7e + 31 */
+	sub	a5, a5, a4
+	slli	a5, a5, 23
+	add	a2, a2, a5
+
+	/* Add the sign.  */
+	slli	a7, a7, 31
+	or	a2, a2, a7
+
+	/* Round up if the leftover fraction is >= 1/2.  */
+	bgez	a6, .Lfloatsisf_return
+	addi	a2, a2, 1	/* Overflow to the exponent is OK.  */
+
+	/* Check if the leftover fraction is exactly 1/2.  */
+	slli	a6, a6, 1
+	beqz	a6, .Lfloatsisf_exactlyhalf
+
+.Lfloatsisf_return:
+	leaf_return
+
+.Lfloatsisf_exactlyhalf:
+	/* Round down to the nearest even value.  */
+	srli	a2, a2, 1
+	slli	a2, a2, 1
+	leaf_return
+
+#endif /* L_floatsisf */
+
+#ifdef L_floatdisf
+
+	.align	4
+	.global	__floatundisf
+	.type	__floatundisf, @function
+__floatundisf:
+	leaf_entry sp, 16
+
+	/* Check for zero.  */
+	or	a4, xh, xl
+	beqz	a4, 2f
+
+	/* Set the sign to zero and jump to the floatdisf code.  */
+	movi	a7, 0
+	j	.Lfloatdisf_normalize
+
+	.align	4
+	.global	__floatdisf
+	.type	__floatdisf, @function
+__floatdisf:
+	leaf_entry sp, 16
+
+	/* Check for zero.  */
+	or	a4, xh, xl
+	beqz	a4, 2f
+
+	/* Save the sign.  */
+	extui	a7, xh, 31, 1
+
+	/* Get the absolute value.  */
+	bgez	xh, .Lfloatdisf_normalize
+	neg	xl, xl
+	neg	xh, xh
+	beqz	xl, .Lfloatdisf_normalize
+	addi	xh, xh, -1
+
+.Lfloatdisf_normalize:
+	/* Normalize with the first 1 bit in the msb of xh.  */
+	beqz	xh, .Lfloatdisf_bigshift
+	do_nsau	a4, xh, a5, a6
+	ssl	a4
+	src	xh, xh, xl
+	sll	xl, xl
+
+.Lfloatdisf_shifted:
+	/* Shift the mantissa into position, with rounding bits in a6.  */
+	ssai	8
+	sll	a5, xl
+	src	a6, xh, xl
+	srl	xh, xh
+	beqz	a5, 1f
+	movi	a5, 1
+	or	a6, a6, a5
+1:
+	/* Set the exponent.  */
+	movi	a5, 0xbd	/* 0x7e + 63 */
+	sub	a5, a5, a4
+	slli	a5, a5, 23
+	add	a2, xh, a5
+
+	/* Add the sign.  */
+	slli	a7, a7, 31
+	or	a2, a2, a7
+
+	/* Round up if the leftover fraction is >= 1/2.  */
+	bgez	a6, 2f
+	addi	a2, a2, 1	/* Overflow to the exponent is OK.  */
+
+	/* Check if the leftover fraction is exactly 1/2.  */
+	slli	a6, a6, 1
+	beqz	a6, .Lfloatdisf_exactlyhalf
+2:	leaf_return
+
+.Lfloatdisf_bigshift:
+	/* xh is zero.  Normalize with first 1 bit of xl in the msb of xh.  */
+	do_nsau	a4, xl, a5, a6
+	ssl	a4
+	sll	xh, xl
+	movi	xl, 0
+	addi	a4, a4, 32
+	j	.Lfloatdisf_shifted
+
+.Lfloatdisf_exactlyhalf:
+	/* Round down to the nearest even value.  */
+	srli	a2, a2, 1
+	slli	a2, a2, 1
+	leaf_return
+
+#endif /* L_floatdisf */
diff --git a/libgcc/config/xtensa/lib1funcs.S b/libgcc/config/xtensa/lib1funcs.S
new file mode 100644
index 00000000000..071b9171177
--- /dev/null
+++ b/libgcc/config/xtensa/lib1funcs.S
@@ -0,0 +1,845 @@
+/* Assembly functions for the Xtensa version of libgcc1.
+   Copyright (C) 2001, 2002, 2003, 2005, 2006, 2007, 2009
+   Free Software Foundation, Inc.
+   Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#include "xtensa-config.h"
+
+/* Define macros for the ABS and ADDX* instructions to handle cases
+   where they are not included in the Xtensa processor configuration.  */
+
+	.macro	do_abs dst, src, tmp
+#if XCHAL_HAVE_ABS
+	abs	\dst, \src
+#else
+	neg	\tmp, \src
+	movgez	\tmp, \src, \src
+	mov	\dst, \tmp
+#endif
+	.endm
+
+	.macro	do_addx2 dst, as, at, tmp
+#if XCHAL_HAVE_ADDX
+	addx2	\dst, \as, \at
+#else
+	slli	\tmp, \as, 1
+	add	\dst, \tmp, \at
+#endif
+	.endm
+
+	.macro	do_addx4 dst, as, at, tmp
+#if XCHAL_HAVE_ADDX
+	addx4	\dst, \as, \at
+#else
+	slli	\tmp, \as, 2
+	add	\dst, \tmp, \at
+#endif
+	.endm
+
+	.macro	do_addx8 dst, as, at, tmp
+#if XCHAL_HAVE_ADDX
+	addx8	\dst, \as, \at
+#else
+	slli	\tmp, \as, 3
+	add	\dst, \tmp, \at
+#endif
+	.endm
+
+/* Define macros for leaf function entry and return, supporting either the
+   standard register windowed ABI or the non-windowed call0 ABI.  These
+   macros do not allocate any extra stack space, so they only work for
+   leaf functions that do not need to spill anything to the stack.  */
+
+	.macro leaf_entry reg, size
+#if XCHAL_HAVE_WINDOWED && !__XTENSA_CALL0_ABI__
+	entry \reg, \size
+#else
+	/* do nothing */
+#endif
+	.endm
+
+	.macro leaf_return
+#if XCHAL_HAVE_WINDOWED && !__XTENSA_CALL0_ABI__
+	retw
+#else
+	ret
+#endif
+	.endm
+
+
+#ifdef L_mulsi3
+	.align	4
+	.global	__mulsi3
+	.type	__mulsi3, @function
+__mulsi3:
+	leaf_entry sp, 16
+
+#if XCHAL_HAVE_MUL32
+	mull	a2, a2, a3
+
+#elif XCHAL_HAVE_MUL16
+	or	a4, a2, a3
+	srai	a4, a4, 16
+	bnez	a4, .LMUL16
+	mul16u	a2, a2, a3
+	leaf_return
+.LMUL16:
+	srai	a4, a2, 16
+	srai	a5, a3, 16
+	mul16u	a7, a4, a3
+	mul16u	a6, a5, a2
+	mul16u	a4, a2, a3
+	add	a7, a7, a6
+	slli	a7, a7, 16
+	add	a2, a7, a4
+
+#elif XCHAL_HAVE_MAC16
+	mul.aa.hl a2, a3
+	mula.aa.lh a2, a3
+	rsr	a5, ACCLO
+	umul.aa.ll a2, a3
+	rsr	a4, ACCLO
+	slli	a5, a5, 16
+	add	a2, a4, a5
+
+#else /* !MUL32 && !MUL16 && !MAC16 */
+
+	/* Multiply one bit at a time, but unroll the loop 4x to better
+	   exploit the addx instructions and avoid overhead.
+	   Peel the first iteration to save a cycle on init.  */
+
+	/* Avoid negative numbers.  */
+	xor	a5, a2, a3	/* Top bit is 1 if one input is negative.  */
+	do_abs	a3, a3, a6
+	do_abs	a2, a2, a6
+
+	/* Swap so the second argument is smaller.  */
+	sub	a7, a2, a3
+	mov	a4, a3
+	movgez	a4, a2, a7	/* a4 = max (a2, a3) */
+	movltz	a3, a2, a7	/* a3 = min (a2, a3) */
+
+	movi	a2, 0
+	extui	a6, a3, 0, 1
+	movnez	a2, a4, a6
+
+	do_addx2 a7, a4, a2, a7
+	extui	a6, a3, 1, 1
+	movnez	a2, a7, a6
+
+	do_addx4 a7, a4, a2, a7
+	extui	a6, a3, 2, 1
+	movnez	a2, a7, a6
+
+	do_addx8 a7, a4, a2, a7
+	extui	a6, a3, 3, 1
+	movnez	a2, a7, a6
+
+	bgeui	a3, 16, .Lmult_main_loop
+	neg	a3, a2
+	movltz	a2, a3, a5
+	leaf_return
+
+	.align	4
+.Lmult_main_loop:
+	srli	a3, a3, 4
+	slli	a4, a4, 4
+
+	add	a7, a4, a2
+	extui	a6, a3, 0, 1
+	movnez	a2, a7, a6
+
+	do_addx2 a7, a4, a2, a7
+	extui	a6, a3, 1, 1
+	movnez	a2, a7, a6
+
+	do_addx4 a7, a4, a2, a7
+	extui	a6, a3, 2, 1
+	movnez	a2, a7, a6
+
+	do_addx8 a7, a4, a2, a7
+	extui	a6, a3, 3, 1
+	movnez	a2, a7, a6
+
+	bgeui	a3, 16, .Lmult_main_loop
+
+	neg	a3, a2
+	movltz	a2, a3, a5
+
+#endif /* !MUL32 && !MUL16 && !MAC16 */
+
+	leaf_return
+	.size	__mulsi3, . - __mulsi3
+
+#endif /* L_mulsi3 */
+
+
+#ifdef L_umulsidi3
+
+#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#define XCHAL_NO_MUL 1
+#endif
+
+	.align	4
+	.global	__umulsidi3
+	.type	__umulsidi3, @function
+__umulsidi3:
+#if __XTENSA_CALL0_ABI__
+	leaf_entry sp, 32
+	addi	sp, sp, -32
+	s32i	a12, sp, 16
+	s32i	a13, sp, 20
+	s32i	a14, sp, 24
+	s32i	a15, sp, 28
+#elif XCHAL_NO_MUL
+	/* This is not really a leaf function; allocate enough stack space
+	   to allow CALL12s to a helper function.  */
+	leaf_entry sp, 48
+#else
+	leaf_entry sp, 16
+#endif
+
+#ifdef __XTENSA_EB__
+#define wh a2
+#define wl a3
+#else
+#define wh a3
+#define wl a2
+#endif /* __XTENSA_EB__ */
+
+	/* This code is taken from the mulsf3 routine in ieee754-sf.S.
+	   See more comments there.  */
+
+#if XCHAL_HAVE_MUL32_HIGH
+	mull	a6, a2, a3
+	muluh	wh, a2, a3
+	mov	wl, a6
+
+#else /* ! MUL32_HIGH */
+
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
+	/* a0 and a8 will be clobbered by calling the multiply function
+	   but a8 is not used here and need not be saved.  */
+	s32i	a0, sp, 0
+#endif
+
+#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32
+
+#define a2h a4
+#define a3h a5
+
+	/* Get the high halves of the inputs into registers.  */
+	srli	a2h, a2, 16
+	srli	a3h, a3, 16
+
+#define a2l a2
+#define a3l a3
+
+#if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16
+	/* Clear the high halves of the inputs.  This does not matter
+	   for MUL16 because the high bits are ignored.  */
+	extui	a2, a2, 0, 16
+	extui	a3, a3, 0, 16
+#endif
+#endif /* MUL16 || MUL32 */
+
+
+#if XCHAL_HAVE_MUL16
+
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+	mul16u	dst, xreg ## xhalf, yreg ## yhalf
+
+#elif XCHAL_HAVE_MUL32
+
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+	mull	dst, xreg ## xhalf, yreg ## yhalf
+
+#elif XCHAL_HAVE_MAC16
+
+/* The preprocessor insists on inserting a space when concatenating after
+   a period in the definition of do_mul below.  These macros are a workaround
+   using underscores instead of periods when doing the concatenation.  */
+#define umul_aa_ll umul.aa.ll
+#define umul_aa_lh umul.aa.lh
+#define umul_aa_hl umul.aa.hl
+#define umul_aa_hh umul.aa.hh
+
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+	umul_aa_ ## xhalf ## yhalf	xreg, yreg; \
+	rsr	dst, ACCLO
+
+#else /* no multiply hardware */
+
+#define set_arg_l(dst, src) \
+	extui	dst, src, 0, 16
+#define set_arg_h(dst, src) \
+	srli	dst, src, 16
+
+#if __XTENSA_CALL0_ABI__
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+	set_arg_ ## xhalf (a13, xreg); \
+	set_arg_ ## yhalf (a14, yreg); \
+	call0	.Lmul_mulsi3; \
+	mov	dst, a12
+#else
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+	set_arg_ ## xhalf (a14, xreg); \
+	set_arg_ ## yhalf (a15, yreg); \
+	call12	.Lmul_mulsi3; \
+	mov	dst, a14
+#endif /* __XTENSA_CALL0_ABI__ */
+
+#endif /* no multiply hardware */
+
+	/* Add pp1 and pp2 into a6 with carry-out in a9.  */
+	do_mul(a6, a2, l, a3, h)	/* pp 1 */
+	do_mul(a11, a2, h, a3, l)	/* pp 2 */
+	movi	a9, 0
+	add	a6, a6, a11
+	bgeu	a6, a11, 1f
+	addi	a9, a9, 1
+1:
+	/* Shift the high half of a9/a6 into position in a9.  Note that
+	   this value can be safely incremented without any carry-outs.  */
+	ssai	16
+	src	a9, a9, a6
+
+	/* Compute the low word into a6.  */
+	do_mul(a11, a2, l, a3, l)	/* pp 0 */
+	sll	a6, a6
+	add	a6, a6, a11
+	bgeu	a6, a11, 1f
+	addi	a9, a9, 1
+1:
+	/* Compute the high word into wh.  */
+	do_mul(wh, a2, h, a3, h)	/* pp 3 */
+	add	wh, wh, a9
+	mov	wl, a6
+
+#endif /* !MUL32_HIGH */
+
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
+	/* Restore the original return address.  */
+	l32i	a0, sp, 0
+#endif
+#if __XTENSA_CALL0_ABI__
+	l32i	a12, sp, 16
+	l32i	a13, sp, 20
+	l32i	a14, sp, 24
+	l32i	a15, sp, 28
+	addi	sp, sp, 32
+#endif
+	leaf_return
+
+#if XCHAL_NO_MUL
+
+	/* For Xtensa processors with no multiply hardware, this simplified
+	   version of _mulsi3 is used for multiplying 16-bit chunks of
+	   the floating-point mantissas.  When using CALL0, this function
+	   uses a custom ABI: the inputs are passed in a13 and a14, the
+	   result is returned in a12, and a8 and a15 are clobbered.  */
+	.align	4
+.Lmul_mulsi3:
+	leaf_entry sp, 16
+	.macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2
+	movi	\dst, 0
+1:	add	\tmp1, \src2, \dst
+	extui	\tmp2, \src1, 0, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx2 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 1, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx4 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 2, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	do_addx8 \tmp1, \src2, \dst, \tmp1
+	extui	\tmp2, \src1, 3, 1
+	movnez	\dst, \tmp1, \tmp2
+
+	srli	\src1, \src1, 4
+	slli	\src2, \src2, 4
+	bnez	\src1, 1b
+	.endm
+#if __XTENSA_CALL0_ABI__
+	mul_mulsi3_body a12, a13, a14, a15, a8
+#else
+	/* The result will be written into a2, so save that argument in a4.  */
+	mov	a4, a2
+	mul_mulsi3_body a2, a4, a3, a5, a6
+#endif
+	leaf_return
+#endif /* XCHAL_NO_MUL */
+
+	.size	__umulsidi3, . - __umulsidi3
+
+#endif /* L_umulsidi3 */
+
+
+/* Define a macro for the NSAU (unsigned normalize shift amount)
+   instruction, which computes the number of leading zero bits,
+   to handle cases where it is not included in the Xtensa processor
+   configuration.  */
+
+	.macro	do_nsau cnt, val, tmp, a
+#if XCHAL_HAVE_NSA
+	nsau	\cnt, \val
+#else
+	mov	\a, \val
+	movi	\cnt, 0
+	extui	\tmp, \a, 16, 16
+	bnez	\tmp, 0f
+	movi	\cnt, 16
+	slli	\a, \a, 16
+0:
+	extui	\tmp, \a, 24, 8
+	bnez	\tmp, 1f
+	addi	\cnt, \cnt, 8
+	slli	\a, \a, 8
+1:
+	movi	\tmp, __nsau_data
+	extui	\a, \a, 24, 8
+	add	\tmp, \tmp, \a
+	l8ui	\tmp, \tmp, 0
+	add	\cnt, \cnt, \tmp
+#endif /* !XCHAL_HAVE_NSA */
+	.endm
+
+#ifdef L_clz
+	.section .rodata
+	.align	4
+	.global	__nsau_data
+	.type	__nsau_data, @object
+__nsau_data:
+#if !XCHAL_HAVE_NSA
+	.byte	8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4
+	.byte	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
+	.byte	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+	.byte	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+	.byte	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+	.byte	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+	.byte	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+	.byte	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+	.byte	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+#endif /* !XCHAL_HAVE_NSA */
+	.size	__nsau_data, . - __nsau_data
+	.hidden	__nsau_data
+#endif /* L_clz */
+
+
+#ifdef L_clzsi2
+	.align	4
+	.global	__clzsi2
+	.type	__clzsi2, @function
+__clzsi2:
+	leaf_entry sp, 16
+	do_nsau	a2, a2, a3, a4
+	leaf_return
+	.size	__clzsi2, . - __clzsi2
+
+#endif /* L_clzsi2 */
+
+
+#ifdef L_ctzsi2
+	.align	4
+	.global	__ctzsi2
+	.type	__ctzsi2, @function
+__ctzsi2:
+	leaf_entry sp, 16
+	neg	a3, a2
+	and	a3, a3, a2
+	do_nsau	a2, a3, a4, a5
+	neg	a2, a2
+	addi	a2, a2, 31
+	leaf_return
+	.size	__ctzsi2, . - __ctzsi2
+
+#endif /* L_ctzsi2 */
+
+
+#ifdef L_ffssi2
+	.align	4
+	.global	__ffssi2
+	.type	__ffssi2, @function
+__ffssi2:
+	leaf_entry sp, 16
+	neg	a3, a2
+	and	a3, a3, a2
+	do_nsau	a2, a3, a4, a5
+	neg	a2, a2
+	addi	a2, a2, 32
+	leaf_return
+	.size	__ffssi2, . - __ffssi2
+
+#endif /* L_ffssi2 */
+
+
+#ifdef L_udivsi3
+	.align	4
+	.global	__udivsi3
+	.type	__udivsi3, @function
+__udivsi3:
+	leaf_entry sp, 16
+#if XCHAL_HAVE_DIV32
+	quou	a2, a2, a3
+#else
+	bltui	a3, 2, .Lle_one	/* check if the divisor <= 1 */
+
+	mov	a6, a2		/* keep dividend in a6 */
+	do_nsau	a5, a6, a2, a7	/* dividend_shift = nsau (dividend) */
+	do_nsau	a4, a3, a2, a7	/* divisor_shift = nsau (divisor) */
+	bgeu	a5, a4, .Lspecial
+
+	sub	a4, a4, a5	/* count = divisor_shift - dividend_shift */
+	ssl	a4
+	sll	a3, a3		/* divisor <<= count */
+	movi	a2, 0		/* quotient = 0 */
+
+	/* test-subtract-and-shift loop; one quotient bit on each iteration */
+#if XCHAL_HAVE_LOOPS
+	loopnez	a4, .Lloopend
+#endif /* XCHAL_HAVE_LOOPS */
+.Lloop:
+	bltu	a6, a3, .Lzerobit
+	sub	a6, a6, a3
+	addi	a2, a2, 1
+.Lzerobit:
+	slli	a2, a2, 1
+	srli	a3, a3, 1
+#if !XCHAL_HAVE_LOOPS
+	addi	a4, a4, -1
+	bnez	a4, .Lloop
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lloopend:
+
+	bltu	a6, a3, .Lreturn
+	addi	a2, a2, 1	/* increment quotient if dividend >= divisor */
+.Lreturn:
+	leaf_return
+
+.Lle_one:
+	beqz	a3, .Lerror	/* if divisor == 1, return the dividend */
+	leaf_return
+
+.Lspecial:
+	/* return dividend >= divisor */
+	bltu	a6, a3, .Lreturn0
+	movi	a2, 1
+	leaf_return
+
+.Lerror:
+	/* Divide by zero: Use an illegal instruction to force an exception.
+	   The subsequent "DIV0" string can be recognized by the exception
+	   handler to identify the real cause of the exception.  */
+	ill
+	.ascii	"DIV0"
+
+.Lreturn0:
+	movi	a2, 0
+#endif /* XCHAL_HAVE_DIV32 */
+	leaf_return
+	.size	__udivsi3, . - __udivsi3
+
+#endif /* L_udivsi3 */
+
+
+#ifdef L_divsi3
+	.align	4
+	.global	__divsi3
+	.type	__divsi3, @function
+__divsi3:
+	leaf_entry sp, 16
+#if XCHAL_HAVE_DIV32
+	quos	a2, a2, a3
+#else
+	xor	a7, a2, a3	/* sign = dividend ^ divisor */
+	do_abs	a6, a2, a4	/* udividend = abs (dividend) */
+	do_abs	a3, a3, a4	/* udivisor = abs (divisor) */
+	bltui	a3, 2, .Lle_one	/* check if udivisor <= 1 */
+	do_nsau	a5, a6, a2, a8	/* udividend_shift = nsau (udividend) */
+	do_nsau	a4, a3, a2, a8	/* udivisor_shift = nsau (udivisor) */
+	bgeu	a5, a4, .Lspecial
+
+	sub	a4, a4, a5	/* count = udivisor_shift - udividend_shift */
+	ssl	a4
+	sll	a3, a3		/* udivisor <<= count */
+	movi	a2, 0		/* quotient = 0 */
+
+	/* test-subtract-and-shift loop; one quotient bit on each iteration */
+#if XCHAL_HAVE_LOOPS
+	loopnez	a4, .Lloopend
+#endif /* XCHAL_HAVE_LOOPS */
+.Lloop:
+	bltu	a6, a3, .Lzerobit
+	sub	a6, a6, a3
+	addi	a2, a2, 1
+.Lzerobit:
+	slli	a2, a2, 1
+	srli	a3, a3, 1
+#if !XCHAL_HAVE_LOOPS
+	addi	a4, a4, -1
+	bnez	a4, .Lloop
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lloopend:
+
+	bltu	a6, a3, .Lreturn
+	addi	a2, a2, 1	/* increment if udividend >= udivisor */
+.Lreturn:
+	neg	a5, a2
+	movltz	a2, a5, a7	/* return (sign < 0) ? -quotient : quotient */
+	leaf_return
+
+.Lle_one:
+	beqz	a3, .Lerror
+	neg	a2, a6		/* if udivisor == 1, then return... */
+	movgez	a2, a6, a7	/* (sign < 0) ? -udividend : udividend */
+	leaf_return
+
+.Lspecial:
+	bltu	a6, a3, .Lreturn0 /* if dividend < divisor, return 0 */
+	movi	a2, 1
+	movi	a4, -1
+	movltz	a2, a4, a7	/* else return (sign < 0) ? -1 : 1 */
+	leaf_return
+
+.Lerror:
+	/* Divide by zero: Use an illegal instruction to force an exception.
+	   The subsequent "DIV0" string can be recognized by the exception
+	   handler to identify the real cause of the exception.  */
+	ill
+	.ascii	"DIV0"
+
+.Lreturn0:
+	movi	a2, 0
+#endif /* XCHAL_HAVE_DIV32 */
+	leaf_return
+	.size	__divsi3, . - __divsi3
+
+#endif /* L_divsi3 */
+
+
+#ifdef L_umodsi3
+	.align	4
+	.global	__umodsi3
+	.type	__umodsi3, @function
+__umodsi3:
+	leaf_entry sp, 16
+#if XCHAL_HAVE_DIV32
+	remu	a2, a2, a3
+#else
+	bltui	a3, 2, .Lle_one	/* check if the divisor is <= 1 */
+
+	do_nsau	a5, a2, a6, a7	/* dividend_shift = nsau (dividend) */
+	do_nsau	a4, a3, a6, a7	/* divisor_shift = nsau (divisor) */
+	bgeu	a5, a4, .Lspecial
+
+	sub	a4, a4, a5	/* count = divisor_shift - dividend_shift */
+	ssl	a4
+	sll	a3, a3		/* divisor <<= count */
+
+	/* test-subtract-and-shift loop */
+#if XCHAL_HAVE_LOOPS
+	loopnez	a4, .Lloopend
+#endif /* XCHAL_HAVE_LOOPS */
+.Lloop:
+	bltu	a2, a3, .Lzerobit
+	sub	a2, a2, a3
+.Lzerobit:
+	srli	a3, a3, 1
+#if !XCHAL_HAVE_LOOPS
+	addi	a4, a4, -1
+	bnez	a4, .Lloop
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lloopend:
+
+.Lspecial:
+	bltu	a2, a3, .Lreturn
+	sub	a2, a2, a3	/* subtract once more if dividend >= divisor */
+.Lreturn:
+	leaf_return
+
+.Lle_one:
+	bnez	a3, .Lreturn0
+
+	/* Divide by zero: Use an illegal instruction to force an exception.
+	   The subsequent "DIV0" string can be recognized by the exception
+	   handler to identify the real cause of the exception.  */
+	ill
+	.ascii	"DIV0"
+
+.Lreturn0:
+	movi	a2, 0
+#endif /* XCHAL_HAVE_DIV32 */
+	leaf_return
+	.size	__umodsi3, . - __umodsi3
+
+#endif /* L_umodsi3 */
+
+
+#ifdef L_modsi3
+	.align	4
+	.global	__modsi3
+	.type	__modsi3, @function
+__modsi3:
+	leaf_entry sp, 16
+#if XCHAL_HAVE_DIV32
+	rems	a2, a2, a3
+#else
+	mov	a7, a2		/* save original (signed) dividend */
+	do_abs	a2, a2, a4	/* udividend = abs (dividend) */
+	do_abs	a3, a3, a4	/* udivisor = abs (divisor) */
+	bltui	a3, 2, .Lle_one	/* check if udivisor <= 1 */
+	do_nsau	a5, a2, a6, a8	/* udividend_shift = nsau (udividend) */
+	do_nsau	a4, a3, a6, a8	/* udivisor_shift = nsau (udivisor) */
+	bgeu	a5, a4, .Lspecial
+
+	sub	a4, a4, a5	/* count = udivisor_shift - udividend_shift */
+	ssl	a4
+	sll	a3, a3		/* udivisor <<= count */
+
+	/* test-subtract-and-shift loop */
+#if XCHAL_HAVE_LOOPS
+	loopnez	a4, .Lloopend
+#endif /* XCHAL_HAVE_LOOPS */
+.Lloop:
+	bltu	a2, a3, .Lzerobit
+	sub	a2, a2, a3
+.Lzerobit:
+	srli	a3, a3, 1
+#if !XCHAL_HAVE_LOOPS
+	addi	a4, a4, -1
+	bnez	a4, .Lloop
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lloopend:
+
+.Lspecial:
+	bltu	a2, a3, .Lreturn
+	sub	a2, a2, a3	/* subtract again if udividend >= udivisor */
+.Lreturn:
+	bgez	a7, .Lpositive
+	neg	a2, a2		/* if (dividend < 0), return -udividend */
+.Lpositive:
+	leaf_return
+
+.Lle_one:
+	bnez	a3, .Lreturn0
+
+	/* Divide by zero: Use an illegal instruction to force an exception.
+	   The subsequent "DIV0" string can be recognized by the exception
+	   handler to identify the real cause of the exception.  */
+	ill
+	.ascii	"DIV0"
+
+.Lreturn0:
+	movi	a2, 0
+#endif /* XCHAL_HAVE_DIV32 */
+	leaf_return
+	.size	__modsi3, . - __modsi3
+
+#endif /* L_modsi3 */
+
+
+#ifdef __XTENSA_EB__
+#define uh a2
+#define ul a3
+#else
+#define uh a3
+#define ul a2
+#endif /* __XTENSA_EB__ */
+
+
+#ifdef L_ashldi3
+	.align	4
+	.global	__ashldi3
+	.type	__ashldi3, @function
+__ashldi3:
+	leaf_entry sp, 16
+	ssl	a4
+	bgei	a4, 32, .Llow_only
+	src	uh, uh, ul
+	sll	ul, ul
+	leaf_return
+
+.Llow_only:
+	sll	uh, ul
+	movi	ul, 0
+	leaf_return
+	.size	__ashldi3, . - __ashldi3
+
+#endif /* L_ashldi3 */
+
+
+#ifdef L_ashrdi3
+	.align	4
+	.global	__ashrdi3
+	.type	__ashrdi3, @function
+__ashrdi3:
+	leaf_entry sp, 16
+	ssr	a4
+	bgei	a4, 32, .Lhigh_only
+	src	ul, uh, ul
+	sra	uh, uh
+	leaf_return
+
+.Lhigh_only:
+	sra	ul, uh
+	srai	uh, uh, 31
+	leaf_return
+	.size	__ashrdi3, . - __ashrdi3
+
+#endif /* L_ashrdi3 */
+
+
+#ifdef L_lshrdi3
+	.align	4
+	.global	__lshrdi3
+	.type	__lshrdi3, @function
+__lshrdi3:
+	leaf_entry sp, 16
+	ssr	a4
+	bgei	a4, 32, .Lhigh_only1
+	src	ul, uh, ul
+	srl	uh, uh
+	leaf_return
+
+.Lhigh_only1:
+	srl	ul, uh
+	movi	uh, 0
+	leaf_return
+	.size	__lshrdi3, . - __lshrdi3
+
+#endif /* L_lshrdi3 */
+
+
+#include "ieee754-df.S"
+#include "ieee754-sf.S"
diff --git a/libgcc/config/xtensa/t-xtensa b/libgcc/config/xtensa/t-xtensa
index 7d9e9db0487..5bcc0946243 100644
--- a/libgcc/config/xtensa/t-xtensa
+++ b/libgcc/config/xtensa/t-xtensa
@@ -1,2 +1,14 @@
+LIB1ASMSRC = xtensa/lib1funcs.S
+LIB1ASMFUNCS = _mulsi3 _divsi3 _modsi3 _udivsi3 _umodsi3 \
+	_umulsidi3 _clz _clzsi2 _ctzsi2 _ffssi2 \
+	_ashldi3 _ashrdi3 _lshrdi3 \
+	_negsf2 _addsubsf3 _mulsf3 _divsf3 _cmpsf2 _fixsfsi _fixsfdi \
+	_fixunssfsi _fixunssfdi _floatsisf _floatunsisf \
+	_floatdisf _floatundisf \
+	_negdf2 _addsubdf3 _muldf3 _divdf3 _cmpdf2 _fixdfsi _fixdfdi \
+	_fixunsdfsi _fixunsdfdi _floatsidf _floatunsidf \
+	_floatdidf _floatundidf \
+	_truncdfsf2 _extendsfdf2
+
 LIB2ADDEH = $(srcdir)/config/xtensa/unwind-dw2-xtensa.c \
    $(srcdir)/unwind-dw2-fde.c $(srcdir)/unwind-sjlj.c $(srcdir)/unwind-c.c
author	ro <ro@138bc75d-0d04-0410-961f-82ee72b054a4>	2011-11-02 15:03:19 +0000
committer	ro <ro@138bc75d-0d04-0410-961f-82ee72b054a4>	2011-11-02 15:03:19 +0000
commit	9213d2eb44a8b9bcc432b57e246d9b52d5bdc949 (patch)
tree	bfbde9a54f663fb7556b9dacd07709ef97c1961c /libgcc
parent	237490bf10db39b859bd28598ff64f1bd2c84421 (diff)
download	ppe42-gcc-9213d2eb44a8b9bcc432b57e246d9b52d5bdc949.tar.gz ppe42-gcc-9213d2eb44a8b9bcc432b57e246d9b52d5bdc949.zip