summary | refs | log | tree | commit | diff | stats
path: root/clang/lib
diff options
context:
space:
mode:
author: Craig Topper <craig.topper@intel.com> 2017-11-10 05:20:32 +0000
committer: Craig Topper <craig.topper@intel.com> 2017-11-10 05:20:32 +0000
commit: b3d447356f237ff0feac92d1f3c1b2b518c2f853 (patch)
tree: b8c44fd0936fd19fbfab4eb72daf78846f6eaa1e /clang/lib
parent: 8b689f4092acdbd663a82ed8efa90d433acd5d79 (diff)
download: bcm5719-llvm-b3d447356f237ff0feac92d1f3c1b2b518c2f853.tar.gz
download: bcm5719-llvm-b3d447356f237ff0feac92d1f3c1b2b518c2f853.zip
[X86] Reduce the number of FMA builtins needed by the frontend by adding negates to operands of the fmadd and fmaddsub builtins.
The backend should be able to combine the negates to create fmsub, fnmadd, and fnmsub. Converting fmaddsub to fmsubadd still needs work, I think, but should be very doable. This matches what we already do for the masked builtins. This only covers the packed builtins; the scalar builtins will be done after FMA4 is fixed.
llvm-svn: 317873
Diffstat (limited to 'clang/lib')
-rw-r--r-- clang/lib/Headers/fma4intrin.h | 32
-rw-r--r-- clang/lib/Headers/fmaintrin.h | 32
2 files changed, 32 insertions, 32 deletions
diff --git a/clang/lib/Headers/fma4intrin.h b/clang/lib/Headers/fma4intrin.h
index 8431821bba5..cda20e7d810 100644
--- a/clang/lib/Headers/fma4intrin.h
+++ b/clang/lib/Headers/fma4intrin.h
@@ -60,13 +60,13 @@ _mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
{
- return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+ return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
{
- return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+ return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -84,13 +84,13 @@ _mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
{
- return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+ return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
{
- return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+ return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -108,13 +108,13 @@ _mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
- return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+ return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
- return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+ return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -144,13 +144,13 @@ _mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
- return (__m128)__builtin_ia32_vfmsubaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+ return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
- return (__m128d)__builtin_ia32_vfmsubaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+ return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
@@ -168,37 +168,37 @@ _mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
{
- return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+ return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
{
- return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+ return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
{
- return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+ return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
{
- return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+ return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
- return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+ return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
- return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+ return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
@@ -216,13 +216,13 @@ _mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
- return (__m256)__builtin_ia32_vfmsubaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+ return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
- return (__m256d)__builtin_ia32_vfmsubaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+ return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
#undef __DEFAULT_FN_ATTRS
diff --git a/clang/lib/Headers/fmaintrin.h b/clang/lib/Headers/fmaintrin.h
index 235931ccb6b..86a1198879b 100644
--- a/clang/lib/Headers/fmaintrin.h
+++ b/clang/lib/Headers/fmaintrin.h
@@ -58,13 +58,13 @@ _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
- return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+ return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
- return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+ return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -82,13 +82,13 @@ _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
- return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+ return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
- return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+ return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -106,13 +106,13 @@ _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
- return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+ return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
- return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+ return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
@@ -142,13 +142,13 @@ _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
- return (__m128)__builtin_ia32_vfmsubaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+ return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
- return (__m128d)__builtin_ia32_vfmsubaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+ return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
@@ -166,37 +166,37 @@ _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
- return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+ return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
- return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+ return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
- return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+ return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
- return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+ return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
- return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+ return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
- return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+ return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
@@ -214,13 +214,13 @@ _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
- return (__m256)__builtin_ia32_vfmsubaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+ return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
- return (__m256d)__builtin_ia32_vfmsubaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+ return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
#undef __DEFAULT_FN_ATTRS
OpenPOWER on IntegriCloud