summary | refs | log | tree | commit | diff | stats
path: root/llvm/test
diff options
context:
space:
mode:
author: Sanjay Patel <spatel@rotateright.com> 2015-03-10 16:08:36 +0000
committer: Sanjay Patel <spatel@rotateright.com> 2015-03-10 16:08:36 +0000
commit: 19792fb2704439766356e9d86d171ea8b7d815bd (patch)
tree: 8017f6aa7d50ef4c9e32349079478b3aa8fbf040 /llvm/test
parent: c98950671d826aea2ad53a5951cc0061b3c761bc (diff)
download: bcm5719-llvm-19792fb2704439766356e9d86d171ea8b7d815bd.tar.gz
download: bcm5719-llvm-19792fb2704439766356e9d86d171ea8b7d815bd.zip
[X86, AVX] replace vinsertf128 intrinsics with generic shuffles
We want to replace as much custom x86 shuffling via intrinsics as possible because pushing the code down the generic shuffle optimization path allows for better codegen and less complexity in LLVM.

This is the sibling patch for the Clang half of this change: http://reviews.llvm.org/D8088

Differential Revision: http://reviews.llvm.org/D8086

llvm-svn: 231794
Diffstat (limited to 'llvm/test')
-rw-r--r--  llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll  36
-rw-r--r--  llvm/test/CodeGen/X86/avx-intrinsics-x86.ll  24
-rw-r--r--  llvm/test/CodeGen/X86/avx-vinsertf128.ll  42
-rw-r--r--  llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll  52
4 files changed, 47 insertions, 107 deletions
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
index 8d04c16879a..a5ad7ba2616 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -1,5 +1,41 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+; We don't check any vinsertf128 variant with immediate 0 because that's just a blend.
+
+define <4 x double> @test_x86_avx_vinsertf128_pd_256_1(<4 x double> %a0, <2 x double> %a1) {
+ ; CHECK-LABEL: test_x86_avx_vinsertf128_pd_256_1:
+ ; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+ %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 1)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
+
+define <8 x float> @test_x86_avx_vinsertf128_ps_256_1(<8 x float> %a0, <4 x float> %a1) {
+ ; CHECK-LABEL: test_x86_avx_vinsertf128_ps_256_1:
+ ; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+ %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
+
+define <8 x i32> @test_x86_avx_vinsertf128_si_256_1(<8 x i32> %a0, <4 x i32> %a1) {
+ ; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_1:
+ ; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+ %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 1)
+ ret <8 x i32> %res
+}
+
+; Verify that high bits of the immediate are masked off. This should be the equivalent
+; of a vinsertf128 $0 which should be optimized into a blend, so just check that it's
+; not a vinsertf128 $1.
+define <8 x i32> @test_x86_avx_vinsertf128_si_256_2(<8 x i32> %a0, <4 x i32> %a1) {
+ ; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_2:
+ ; CHECK-NOT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+ %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 2)
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
+
define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK: vblendpd
%res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
index 3716cf84989..96d80ea7ae6 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -2187,30 +2187,6 @@ define <4 x i32> @test_x86_avx_vextractf128_si_256(<8 x i32> %a0) {
declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
-define <4 x double> @test_x86_avx_vinsertf128_pd_256(<4 x double> %a0, <2 x double> %a1) {
- ; CHECK: vinsertf128
- %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
- ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
-
-
-define <8 x float> @test_x86_avx_vinsertf128_ps_256(<8 x float> %a0, <4 x float> %a1) {
- ; CHECK: vinsertf128
- %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 7) ; <<8 x float>> [#uses=1]
- ret <8 x float> %res
-}
-declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx_vinsertf128_si_256(<8 x i32> %a0, <4 x i32> %a1) {
- ; CHECK: vinsertf128
- %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1]
- ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
-
-
define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK: vperm2f128
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/avx-vinsertf128.ll b/llvm/test/CodeGen/X86/avx-vinsertf128.ll
index d0f8f4ebaea..38389de7a8a 100644
--- a/llvm/test/CodeGen/X86/avx-vinsertf128.ll
+++ b/llvm/test/CodeGen/X86/avx-vinsertf128.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=CHECK-SSE %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
+; CHECK-LABEL: A:
; CHECK-NOT: vunpck
; CHECK: vinsertf128 $1
define <8 x float> @A(<8 x float> %a) nounwind uwtable readnone ssp {
@@ -9,6 +9,7 @@ entry:
ret <8 x float> %shuffle
}
+; CHECK-LABEL: B:
; CHECK-NOT: vunpck
; CHECK: vinsertf128 $1
define <4 x double> @B(<4 x double> %a) nounwind uwtable readnone ssp {
@@ -22,7 +23,7 @@ declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
; Just check that no crash happens
-; CHECK-SSE: _insert_crash
+; CHECK-LABEL: _insert_crash:
define void @insert_crash() nounwind {
allocas:
%v1.i.i451 = shufflevector <4 x double> zeroinitializer, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -39,7 +40,7 @@ allocas:
;; DAG Combine must remove useless vinsertf128 instructions
-; CHECK: DAGCombineA
+; CHECK-LABEL: DAGCombineA:
; CHECK-NOT: vinsertf128 $1
define <4 x i32> @DAGCombineA(<4 x i32> %v1) nounwind readonly {
%1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -47,7 +48,7 @@ define <4 x i32> @DAGCombineA(<4 x i32> %v1) nounwind readonly {
ret <4 x i32> %2
}
-; CHECK: DAGCombineB
+; CHECK-LABEL: DAGCombineB:
; CHECK: vpaddd %xmm
; CHECK-NOT: vinsertf128 $1
; CHECK: vpaddd %xmm
@@ -57,14 +58,7 @@ define <8 x i32> @DAGCombineB(<8 x i32> %v1, <8 x i32> %v2) nounwind readonly {
ret <8 x i32> %2
}
-; CHECK: insert_pd
-define <4 x double> @insert_pd(<4 x double> %a0, <2 x double> %a1) {
-; CHECK: vinsertf128
-%res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 0)
-ret <4 x double> %res
-}
-
-; CHECK: insert_undef_pd
+; CHECK-LABEL: insert_undef_pd:
define <4 x double> @insert_undef_pd(<4 x double> %a0, <2 x double> %a1) {
; CHECK: vmovaps %ymm1, %ymm0
%res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> undef, <2 x double> %a1, i8 0)
@@ -73,14 +67,7 @@ ret <4 x double> %res
declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
-; CHECK: insert_ps
-define <8 x float> @insert_ps(<8 x float> %a0, <4 x float> %a1) {
-; CHECK: vinsertf128
-%res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 0)
-ret <8 x float> %res
-}
-
-; CHECK: insert_undef_ps
+; CHECK-LABEL: insert_undef_ps:
define <8 x float> @insert_undef_ps(<8 x float> %a0, <4 x float> %a1) {
; CHECK: vmovaps %ymm1, %ymm0
%res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %a1, i8 0)
@@ -89,14 +76,7 @@ ret <8 x float> %res
declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
-; CHECK: insert_si
-define <8 x i32> @insert_si(<8 x i32> %a0, <4 x i32> %a1) {
-; CHECK: vinsertf128
-%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 0)
-ret <8 x i32> %res
-}
-
-; CHECK: insert_undef_si
+; CHECK-LABEL: insert_undef_si:
define <8 x i32> @insert_undef_si(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK: vmovaps %ymm1, %ymm0
%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> undef, <4 x i32> %a1, i8 0)
@@ -105,7 +85,7 @@ ret <8 x i32> %res
declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
; rdar://10643481
-; CHECK: vinsertf128_combine
+; CHECK-LABEL: vinsertf128_combine:
define <8 x float> @vinsertf128_combine(float* nocapture %f) nounwind uwtable readonly ssp {
; CHECK-NOT: vmovaps
; CHECK: vinsertf128
@@ -118,7 +98,7 @@ entry:
}
; rdar://11076953
-; CHECK: vinsertf128_ucombine
+; CHECK-LABEL: vinsertf128_ucombine:
define <8 x float> @vinsertf128_ucombine(float* nocapture %f) nounwind uwtable readonly ssp {
; CHECK-NOT: vmovups
; CHECK: vinsertf128
diff --git a/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll b/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll
index a44d44d1b69..b337a80b84b 100644
--- a/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll
+++ b/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll
@@ -48,58 +48,6 @@ define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
; Merge two consecutive 16-byte subvector loads into a single 32-byte load
; if it's faster.
-declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8)
-
-; Use the vinsertf128 intrinsic to model source code
-; that explicitly uses AVX intrinsics.
-define <8 x float> @combine_16_byte_loads(<4 x float>* %ptr) {
- ; CHECK-LABEL: combine_16_byte_loads
-
- ; SANDYB: vmovups
- ; SANDYB-NEXT: vinsertf128
- ; SANDYB-NEXT: retq
-
- ; BTVER2: vmovups
- ; BTVER2-NEXT: retq
-
- ; HASWELL: vmovups
- ; HASWELL-NEXT: retq
-
- %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 1
- %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
- %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
- %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
- %shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
- %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
- ret <8 x float> %v3
-}
-
-; Swap the operands of the shufflevector and vinsertf128 to ensure that the
-; pattern still matches.
-define <8 x float> @combine_16_byte_loads_swap(<4 x float>* %ptr) {
- ; CHECK-LABEL: combine_16_byte_loads_swap
-
- ; SANDYB: vmovups
- ; SANDYB-NEXT: vinsertf128
- ; SANDYB-NEXT: retq
-
- ; BTVER2: vmovups
- ; BTVER2-NEXT: retq
-
- ; HASWELL: vmovups
- ; HASWELL-NEXT: retq
-
- %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
- %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
- %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
- %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
- %shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
- %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)
- ret <8 x float> %v3
-}
-
-; Replace the vinsertf128 intrinsic with a shufflevector as might be
-; expected from auto-vectorized code.
define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
; CHECK-LABEL: combine_16_byte_loads_no_intrinsic
OpenPOWER on IntegriCloud