Diffstat (limited to 'llvm/test')
| -rw-r--r-- | llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll | 36 |
| -rw-r--r-- | llvm/test/CodeGen/X86/avx-intrinsics-x86.ll | 24 |
| -rw-r--r-- | llvm/test/CodeGen/X86/avx-vinsertf128.ll | 42 |
| -rw-r--r-- | llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll | 52 |
4 files changed, 47 insertions, 107 deletions
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
index 8d04c16879a..a5ad7ba2616 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -1,5 +1,41 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
 
+; We don't check any vinsertf128 variant with immediate 0 because that's just a blend.
+
+define <4 x double> @test_x86_avx_vinsertf128_pd_256_1(<4 x double> %a0, <2 x double> %a1) {
+  ; CHECK-LABEL: test_x86_avx_vinsertf128_pd_256_1:
+  ; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+  %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 1)
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
+
+define <8 x float> @test_x86_avx_vinsertf128_ps_256_1(<8 x float> %a0, <4 x float> %a1) {
+  ; CHECK-LABEL: test_x86_avx_vinsertf128_ps_256_1:
+  ; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+  %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 1)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
+
+define <8 x i32> @test_x86_avx_vinsertf128_si_256_1(<8 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_1:
+  ; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 1)
+  ret <8 x i32> %res
+}
+
+; Verify that high bits of the immediate are masked off. This should be the equivalent
+; of a vinsertf128 $0 which should be optimized into a blend, so just check that it's
+; not a vinsertf128 $1.
+define <8 x i32> @test_x86_avx_vinsertf128_si_256_2(<8 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_2:
+  ; CHECK-NOT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 2)
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
+
 define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
   ; CHECK: vblendpd
   %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
index 3716cf84989..96d80ea7ae6 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -2187,30 +2187,6 @@ define <4 x i32> @test_x86_avx_vextractf128_si_256(<8 x i32> %a0) {
 declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
 
 
-define <4 x double> @test_x86_avx_vinsertf128_pd_256(<4 x double> %a0, <2 x double> %a1) {
-  ; CHECK: vinsertf128
-  %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
-  ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
-
-
-define <8 x float> @test_x86_avx_vinsertf128_ps_256(<8 x float> %a0, <4 x float> %a1) {
-  ; CHECK: vinsertf128
-  %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 7) ; <<8 x float>> [#uses=1]
-  ret <8 x float> %res
-}
-declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx_vinsertf128_si_256(<8 x i32> %a0, <4 x i32> %a1) {
-  ; CHECK: vinsertf128
-  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1]
-  ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
-
-
 define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) {
   ; CHECK: vperm2f128
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/avx-vinsertf128.ll b/llvm/test/CodeGen/X86/avx-vinsertf128.ll
index d0f8f4ebaea..38389de7a8a 100644
--- a/llvm/test/CodeGen/X86/avx-vinsertf128.ll
+++ b/llvm/test/CodeGen/X86/avx-vinsertf128.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=CHECK-SSE %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
 
+; CHECK-LABEL: A:
 ; CHECK-NOT: vunpck
 ; CHECK: vinsertf128 $1
 define <8 x float> @A(<8 x float> %a) nounwind uwtable readnone ssp {
@@ -9,6 +9,7 @@ entry:
   ret <8 x float> %shuffle
 }
 
+; CHECK-LABEL: B:
 ; CHECK-NOT: vunpck
 ; CHECK: vinsertf128 $1
 define <4 x double> @B(<4 x double> %a) nounwind uwtable readnone ssp {
@@ -22,7 +23,7 @@ declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind
 declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
 
 ; Just check that no crash happens
-; CHECK-SSE: _insert_crash
+; CHECK-LABEL: _insert_crash:
 define void @insert_crash() nounwind {
 allocas:
   %v1.i.i451 = shufflevector <4 x double> zeroinitializer, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -39,7 +40,7 @@ allocas:
 
 ;; DAG Combine must remove useless vinsertf128 instructions
 
-; CHECK: DAGCombineA
+; CHECK-LABEL: DAGCombineA:
 ; CHECK-NOT: vinsertf128 $1
 define <4 x i32> @DAGCombineA(<4 x i32> %v1) nounwind readonly {
   %1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -47,7 +48,7 @@ define <4 x i32> @DAGCombineA(<4 x i32> %v1) nounwind readonly {
   ret <4 x i32> %2
 }
 
-; CHECK: DAGCombineB
+; CHECK-LABEL: DAGCombineB:
 ; CHECK: vpaddd %xmm
 ; CHECK-NOT: vinsertf128 $1
 ; CHECK: vpaddd %xmm
@@ -57,14 +58,7 @@ define <8 x i32> @DAGCombineB(<8 x i32> %v1, <8 x i32> %v2) nounwind readonly {
   ret <8 x i32> %2
 }
 
-; CHECK: insert_pd
-define <4 x double> @insert_pd(<4 x double> %a0, <2 x double> %a1) {
-; CHECK: vinsertf128
-%res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 0)
-ret <4 x double> %res
-}
-
-; CHECK: insert_undef_pd
+; CHECK-LABEL: insert_undef_pd:
 define <4 x double> @insert_undef_pd(<4 x double> %a0, <2 x double> %a1) {
 ; CHECK: vmovaps %ymm1, %ymm0
 %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> undef, <2 x double> %a1, i8 0)
@@ -73,14 +67,7 @@ ret <4 x double> %res
 declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
 
 
-; CHECK: insert_ps
-define <8 x float> @insert_ps(<8 x float> %a0, <4 x float> %a1) {
-; CHECK: vinsertf128
-%res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 0)
-ret <8 x float> %res
-}
-
-; CHECK: insert_undef_ps
+; CHECK-LABEL: insert_undef_ps:
 define <8 x float> @insert_undef_ps(<8 x float> %a0, <4 x float> %a1) {
 ; CHECK: vmovaps %ymm1, %ymm0
 %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %a1, i8 0)
@@ -89,14 +76,7 @@ ret <8 x float> %res
 declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
 
 
-; CHECK: insert_si
-define <8 x i32> @insert_si(<8 x i32> %a0, <4 x i32> %a1) {
-; CHECK: vinsertf128
-%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 0)
-ret <8 x i32> %res
-}
-
-; CHECK: insert_undef_si
+; CHECK-LABEL: insert_undef_si:
 define <8 x i32> @insert_undef_si(<8 x i32> %a0, <4 x i32> %a1) {
 ; CHECK: vmovaps %ymm1, %ymm0
 %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> undef, <4 x i32> %a1, i8 0)
@@ -105,7 +85,7 @@ ret <8 x i32> %res
 declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
 
 ; rdar://10643481
-; CHECK: vinsertf128_combine
+; CHECK-LABEL: vinsertf128_combine:
 define <8 x float> @vinsertf128_combine(float* nocapture %f) nounwind uwtable readonly ssp {
 ; CHECK-NOT: vmovaps
 ; CHECK: vinsertf128
@@ -118,7 +98,7 @@ entry:
 }
 
 ; rdar://11076953
-; CHECK: vinsertf128_ucombine
+; CHECK-LABEL: vinsertf128_ucombine:
 define <8 x float> @vinsertf128_ucombine(float* nocapture %f) nounwind uwtable readonly ssp {
 ; CHECK-NOT: vmovups
 ; CHECK: vinsertf128
diff --git a/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll b/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll
index a44d44d1b69..b337a80b84b 100644
--- a/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll
+++ b/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll
@@ -48,58 +48,6 @@ define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
 ; Merge two consecutive 16-byte subvector loads into a single 32-byte load
 ; if it's faster.
 
-declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8)
-
-; Use the vinsertf128 intrinsic to model source code
-; that explicitly uses AVX intrinsics.
-define <8 x float> @combine_16_byte_loads(<4 x float>* %ptr) {
-  ; CHECK-LABEL: combine_16_byte_loads
-
-  ; SANDYB: vmovups
-  ; SANDYB-NEXT: vinsertf128
-  ; SANDYB-NEXT: retq
-
-  ; BTVER2: vmovups
-  ; BTVER2-NEXT: retq
-
-  ; HASWELL: vmovups
-  ; HASWELL-NEXT: retq
-
-  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 1
-  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
-  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
-  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
-  %shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
-  ret <8 x float> %v3
-}
-
-; Swap the operands of the shufflevector and vinsertf128 to ensure that the
-; pattern still matches.
-define <8 x float> @combine_16_byte_loads_swap(<4 x float>* %ptr) {
-  ; CHECK-LABEL: combine_16_byte_loads_swap
-
-  ; SANDYB: vmovups
-  ; SANDYB-NEXT: vinsertf128
-  ; SANDYB-NEXT: retq
-
-  ; BTVER2: vmovups
-  ; BTVER2-NEXT: retq
-
-  ; HASWELL: vmovups
-  ; HASWELL-NEXT: retq
-
-  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
-  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
-  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
-  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
-  %shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
-  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)
-  ret <8 x float> %v3
-}
-
-; Replace the vinsertf128 intrinsic with a shufflevector as might be
-; expected from auto-vectorized code.
 define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
   ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic
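
As a side note on the comments in the first file ("immediate 0 ... is just a blend", "high bits of the immediate are masked off"): the following is a minimal standalone sketch of the generic-shuffle form of a 128-bit insert, not taken from this commit, with an illustrative function name. The first shufflevector widens the 128-bit operand; selecting it for the upper lane is the vinsertf128 $1 case, while selecting it for the lower lane is just a blend of the low half, which is why the immediate-0 variants are not checked for a vinsertf128 instruction.

define <8 x float> @insert128_sketch(<8 x float> %a, <4 x float> %b) {
  ; Widen %b to 256 bits so it can be shuffled together with %a.
  %wide = shufflevector <4 x float> %b, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ; Immediate 1: keep the low half of %a and put %b in the high half.
  %hi = shufflevector <8 x float> %a, <8 x float> %wide, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ; Immediate 0: put %b in the low half and keep the high half of %a, a plain blend.
  %lo = shufflevector <8 x float> %a, <8 x float> %wide, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
  ; Combine both results so neither value is dead.
  %res = fadd <8 x float> %hi, %lo
  ret <8 x float> %res
}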
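
Relatedly, here is a hedged sketch of the shufflevector-only pattern that the remaining combine_16_byte_loads_no_intrinsic test exercises; the actual test body is truncated in this diff, and the function name below is illustrative. Two consecutive unaligned 16-byte loads are concatenated into one <8 x float>, which a target with fast unaligned 32-byte memory accesses may lower to a single vmovups.

define <8 x float> @concat_loads_sketch(<4 x float>* %ptr) {
  ; Load two adjacent <4 x float> values with no alignment guarantee.
  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 1
  %v1 = load <4 x float>, <4 x float>* %ptr, align 1
  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
  ; Concatenate them; the backend may merge this into one 32-byte load.
  %res = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %res
}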

