diff options
| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2016-07-20 10:18:01 +0000 |
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2016-07-20 10:18:01 +0000 |
| commit | e3b9ee0645a62b0be69c31f8a45cfd2195545998 (patch) | |
| tree | dd1f590f87c6eefaba0c436c14f71b1697138156 /clang/test | |
| parent | f345d40ae2a94a00cdb881934f6dae78e0dd0786 (diff) | |
| download | bcm5719-llvm-e3b9ee0645a62b0be69c31f8a45cfd2195545998.tar.gz bcm5719-llvm-e3b9ee0645a62b0be69c31f8a45cfd2195545998.zip | |
[X86][SSE] Reimplement SSE fp2si conversion intrinsics instead of using generic IR
D20859 and D20860 attempted to replace the SSE (V)CVTTPS2DQ and VCVTTPD2DQ truncating conversions with generic IR instead.
It turns out that the behaviour of these intrinsics is different enough from generic IR that this will cause problems: INF/NAN/out-of-range values are guaranteed to result in a 0x80000000 value, which plays havoc with constant folding, which converts them to either zero or UNDEF. This is also an issue with the scalar implementations (which were already generic IR and what I was trying to match).
This patch changes both scalar and packed versions back to using x86-specific builtins.
It also deals with the other scalar conversion cases that are dependent on the runtime rounding mode and can have similar issues with constant folding.
Differential Revision: https://reviews.llvm.org/D22105
llvm-svn: 276102
Diffstat (limited to 'clang/test')
| -rw-r--r-- | clang/test/CodeGen/avx-builtins.c | 4 | ||||
| -rw-r--r-- | clang/test/CodeGen/builtins-x86.c | 8 | ||||
| -rw-r--r-- | clang/test/CodeGen/sse-builtins.c | 9 | ||||
| -rw-r--r-- | clang/test/CodeGen/sse2-builtins.c | 10 |
4 files changed, 17 insertions, 14 deletions
diff --git a/clang/test/CodeGen/avx-builtins.c b/clang/test/CodeGen/avx-builtins.c index bf3e8cc5db6..650e4d280ec 100644 --- a/clang/test/CodeGen/avx-builtins.c +++ b/clang/test/CodeGen/avx-builtins.c @@ -286,13 +286,13 @@ __m256d test_mm256_cvtps_pd(__m128 A) { __m128i test_mm256_cvttpd_epi32(__m256d A) { // CHECK-LABEL: test_mm256_cvttpd_epi32 - // CHECK: fptosi <4 x double> %{{.*}} to <4 x i32> + // CHECK: call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %{{.*}}) return _mm256_cvttpd_epi32(A); } __m256i test_mm256_cvttps_epi32(__m256 A) { // CHECK-LABEL: test_mm256_cvttps_epi32 - // CHECK: fptosi <8 x float> %{{.*}} to <8 x i32> + // CHECK: call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %{{.*}}) return _mm256_cvttps_epi32(A); } diff --git a/clang/test/CodeGen/builtins-x86.c b/clang/test/CodeGen/builtins-x86.c index 55e473fa4e4..8fa24e668f7 100644 --- a/clang/test/CodeGen/builtins-x86.c +++ b/clang/test/CodeGen/builtins-x86.c @@ -287,12 +287,14 @@ void f0() { tmp_V4f = __builtin_ia32_cvtpi2ps(tmp_V4f, tmp_V2i); tmp_V2i = __builtin_ia32_cvtps2pi(tmp_V4f); tmp_i = __builtin_ia32_cvtss2si(tmp_V4f); + tmp_i = __builtin_ia32_cvttss2si(tmp_V4f); tmp_i = __builtin_ia32_rdtsc(); tmp_i = __builtin_ia32_rdtscp(&tmp_Ui); tmp_LLi = __builtin_ia32_rdpmc(tmp_i); #ifdef USE_64 tmp_LLi = __builtin_ia32_cvtss2si64(tmp_V4f); + tmp_LLi = __builtin_ia32_cvttss2si64(tmp_V4f); #endif tmp_V2i = __builtin_ia32_cvttps2pi(tmp_V4f); (void) __builtin_ia32_maskmovq(tmp_V8c, tmp_V8c, tmp_cp); @@ -328,10 +330,14 @@ void f0() { tmp_V2i = __builtin_ia32_cvttpd2pi(tmp_V2d); tmp_V2d = __builtin_ia32_cvtpi2pd(tmp_V2i); tmp_i = __builtin_ia32_cvtsd2si(tmp_V2d); + tmp_i = __builtin_ia32_cvttsd2si(tmp_V2d); + tmp_V4f = __builtin_ia32_cvtsd2ss(tmp_V4f, tmp_V2d); #ifdef USE_64 tmp_LLi = __builtin_ia32_cvtsd2si64(tmp_V2d); + tmp_LLi = __builtin_ia32_cvttsd2si64(tmp_V2d); #endif tmp_V4i = __builtin_ia32_cvtps2dq(tmp_V4f); + tmp_V4i = __builtin_ia32_cvttps2dq(tmp_V4f); (void) 
__builtin_ia32_clflush(tmp_vCp); (void) __builtin_ia32_lfence(); (void) __builtin_ia32_mfence(); @@ -410,7 +416,9 @@ void f0() { tmp_V8f = __builtin_ia32_cvtdq2ps256(tmp_V8i); tmp_V4f = __builtin_ia32_cvtpd2ps256(tmp_V4d); tmp_V8i = __builtin_ia32_cvtps2dq256(tmp_V8f); + tmp_V4i = __builtin_ia32_cvttpd2dq256(tmp_V4d); tmp_V4i = __builtin_ia32_cvtpd2dq256(tmp_V4d); + tmp_V8i = __builtin_ia32_cvttps2dq256(tmp_V8f); tmp_V4d = __builtin_ia32_vperm2f128_pd256(tmp_V4d, tmp_V4d, 0x7); tmp_V8f = __builtin_ia32_vperm2f128_ps256(tmp_V8f, tmp_V8f, 0x7); tmp_V8i = __builtin_ia32_vperm2f128_si256(tmp_V8i, tmp_V8i, 0x7); diff --git a/clang/test/CodeGen/sse-builtins.c b/clang/test/CodeGen/sse-builtins.c index a6c5c1a0a16..6f313b825c9 100644 --- a/clang/test/CodeGen/sse-builtins.c +++ b/clang/test/CodeGen/sse-builtins.c @@ -295,22 +295,19 @@ long long test_mm_cvtss_si64(__m128 A) { int test_mm_cvtt_ss2si(__m128 A) { // CHECK-LABEL: test_mm_cvtt_ss2si - // CHECK: extractelement <4 x float> %{{.*}}, i32 0 - // CHECK: fptosi float %{{.*}} to i32 + // CHECK: call i32 @llvm.x86.sse.cvttss2si(<4 x float> %{{.*}}) return _mm_cvtt_ss2si(A); } int test_mm_cvttss_si32(__m128 A) { // CHECK-LABEL: test_mm_cvttss_si32 - // CHECK: extractelement <4 x float> %{{.*}}, i32 0 - // CHECK: fptosi float %{{.*}} to i32 + // CHECK: call i32 @llvm.x86.sse.cvttss2si(<4 x float> %{{.*}}) return _mm_cvttss_si32(A); } long long test_mm_cvttss_si64(__m128 A) { // CHECK-LABEL: test_mm_cvttss_si64 - // CHECK: extractelement <4 x float> %{{.*}}, i32 0 - // CHECK: fptosi float %{{.*}} to i64 + // CHECK: call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %{{.*}}) return _mm_cvttss_si64(A); } diff --git a/clang/test/CodeGen/sse2-builtins.c b/clang/test/CodeGen/sse2-builtins.c index b340d1a2755..ee9dca7d445 100644 --- a/clang/test/CodeGen/sse2-builtins.c +++ b/clang/test/CodeGen/sse2-builtins.c @@ -507,7 +507,7 @@ long long test_mm_cvtsd_si64(__m128d A) { __m128 test_mm_cvtsd_ss(__m128 A, __m128d B) { // CHECK-LABEL: 
test_mm_cvtsd_ss - // CHECK: fptrunc double %{{.*}} to float + // CHECK: call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %{{.*}}, <2 x double> %{{.*}}) return _mm_cvtsd_ss(A, B); } @@ -569,21 +569,19 @@ __m128i test_mm_cvttpd_epi32(__m128d A) { __m128i test_mm_cvttps_epi32(__m128 A) { // CHECK-LABEL: test_mm_cvttps_epi32 - // CHECK: fptosi <4 x float> %{{.*}} to <4 x i32> + // CHECK: call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %{{.*}}) return _mm_cvttps_epi32(A); } int test_mm_cvttsd_si32(__m128d A) { // CHECK-LABEL: test_mm_cvttsd_si32 - // CHECK: extractelement <2 x double> %{{.*}}, i32 0 - // CHECK: fptosi double %{{.*}} to i32 + // CHECK: call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %{{.*}}) return _mm_cvttsd_si32(A); } long long test_mm_cvttsd_si64(__m128d A) { // CHECK-LABEL: test_mm_cvttsd_si64 - // CHECK: extractelement <2 x double> %{{.*}}, i32 0 - // CHECK: fptosi double %{{.*}} to i64 + // CHECK: call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %{{.*}}) return _mm_cvttsd_si64(A); } |

