summary | refs | log | tree | commit | diff | stats
path: root/llvm
diff options
context:
space:
mode:
author: Sanjay Patel <spatel@rotateright.com> 2019-01-11 14:27:59 +0000
committer: Sanjay Patel <spatel@rotateright.com> 2019-01-11 14:27:59 +0000
commit: 40cd4b77e9a672487f11576ea720aeae86924948 (patch)
tree: 8fa05e4b510e4639a921ef32fba827092247c32d /llvm
parent: fb909207c6789020015d5537cdb7fae7d8883ff7 (diff)
download: bcm5719-llvm-40cd4b77e9a672487f11576ea720aeae86924948.tar.gz
download: bcm5719-llvm-40cd4b77e9a672487f11576ea720aeae86924948.zip
[x86] allow insert/extract when matching horizontal ops
Previously, we limited this transform to cases where the extraction into the build vector happens from vectors of the same type as the build vector, but that's not required.

There's a slight potential regression seen in the AVX512 result for phadd -- we're using the 256-bit flavor of the instruction now even though the 128-bit subset is sufficient. The same problem could already be seen in the AVX2 result. Follow-up patches will attempt to narrow that back down.

llvm-svn: 350928
Diffstat (limited to 'llvm')
-rw-r--r-- llvm/lib/Target/X86/X86ISelLowering.cpp 16
-rw-r--r-- llvm/test/CodeGen/X86/haddsub-undef.ll 129
-rw-r--r-- llvm/test/CodeGen/X86/phaddsub-undef.ll 4
3 files changed, 39 insertions, 110 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c8ed78d0b4e..e074655f8b5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8307,10 +8307,20 @@ static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
SelectionDAG &DAG, unsigned HOpcode,
SDValue V0, SDValue V1) {
- // TODO: We should extract/insert to match the size of the build vector.
+ // If either input vector is not the same size as the build vector,
+ // extract/insert the low bits to the correct size.
+ // This is free (examples: zmm --> xmm, xmm --> ymm).
MVT VT = BV->getSimpleValueType(0);
- if (V0.getValueType() != VT || V1.getValueType() != VT)
- return SDValue();
+ unsigned Width = VT.getSizeInBits();
+ if (V0.getValueSizeInBits() > Width)
+ V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
+ else if (V0.getValueSizeInBits() < Width)
+ V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
+
+ if (V1.getValueSizeInBits() > Width)
+ V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
+ else if (V1.getValueSizeInBits() < Width)
+ V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
}
diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll
index c7cee4ec314..3d13c8f76a2 100644
--- a/llvm/test/CodeGen/X86/haddsub-undef.ll
+++ b/llvm/test/CodeGen/X86/haddsub-undef.ll
@@ -737,21 +737,11 @@ define <4 x float> @v8f32_inputs_v4f32_output_0101(<8 x float> %a, <8 x float> %
; SSE-NEXT: haddps %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-SLOW-LABEL: v8f32_inputs_v4f32_output_0101:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; AVX-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX-SLOW-NEXT: vaddss %xmm3, %xmm1, %xmm1
-; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX-SLOW-NEXT: vzeroupper
-; AVX-SLOW-NEXT: retq
-;
-; AVX-FAST-LABEL: v8f32_inputs_v4f32_output_0101:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
-; AVX-FAST-NEXT: vzeroupper
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: v8f32_inputs_v4f32_output_0101:
+; AVX: # %bb.0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%a0 = extractelement <8 x float> %a, i32 0
%a1 = extractelement <8 x float> %a, i32 1
%b0 = extractelement <8 x float> %b, i32 0
@@ -769,26 +759,11 @@ define <4 x float> @v8f32_input0_v4f32_output_0123(<8 x float> %a, <4 x float> %
; SSE-NEXT: haddps %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-SLOW-LABEL: v8f32_input0_v4f32_output_0123:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX-SLOW-NEXT: vaddss %xmm1, %xmm3, %xmm1
-; AVX-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX-SLOW-NEXT: vzeroupper
-; AVX-SLOW-NEXT: retq
-;
-; AVX-FAST-LABEL: v8f32_input0_v4f32_output_0123:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX-FAST-NEXT: vaddss %xmm1, %xmm2, %xmm1
-; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
-; AVX-FAST-NEXT: vzeroupper
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: v8f32_input0_v4f32_output_0123:
+; AVX: # %bb.0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%a0 = extractelement <8 x float> %a, i32 0
%a1 = extractelement <8 x float> %a, i32 1
%b2 = extractelement <4 x float> %b, i32 2
@@ -806,28 +781,11 @@ define <4 x float> @v8f32_input1_v4f32_output_2301(<4 x float> %a, <8 x float> %
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-SLOW-LABEL: v8f32_input1_v4f32_output_2301:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0
-; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX-SLOW-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
-; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX-SLOW-NEXT: vzeroupper
-; AVX-SLOW-NEXT: retq
-;
-; AVX-FAST-LABEL: v8f32_input1_v4f32_output_2301:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX-FAST-NEXT: vaddss %xmm0, %xmm2, %xmm0
-; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
-; AVX-FAST-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
-; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX-FAST-NEXT: vzeroupper
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: v8f32_input1_v4f32_output_2301:
+; AVX: # %bb.0:
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%a2 = extractelement <4 x float> %a, i32 2
%a3 = extractelement <4 x float> %a, i32 3
%b0 = extractelement <8 x float> %b, i32 0
@@ -847,14 +805,7 @@ define <4 x float> @v8f32_inputs_v4f32_output_2323(<8 x float> %a, <8 x float> %
;
; AVX-LABEL: v8f32_inputs_v4f32_output_2323:
; AVX: # %bb.0:
-; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX-NEXT: vaddss %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%a2 = extractelement <8 x float> %a, i32 2
@@ -876,45 +827,21 @@ define <4 x float> @v16f32_inputs_v4f32_output_0123(<16 x float> %a, <16 x float
;
; AVX1-SLOW-LABEL: v16f32_inputs_v4f32_output_0123:
; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
-; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm3, %xmm2
-; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
+; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: v16f32_inputs_v4f32_output_0123:
; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
+; AVX1-FAST-NEXT: vhaddps %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
-; AVX512-SLOW-LABEL: v16f32_inputs_v4f32_output_0123:
-; AVX512-SLOW: # %bb.0:
-; AVX512-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX512-SLOW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
-; AVX512-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512-SLOW-NEXT: vaddss %xmm1, %xmm3, %xmm1
-; AVX512-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512-SLOW-NEXT: vzeroupper
-; AVX512-SLOW-NEXT: retq
-;
-; AVX512-FAST-LABEL: v16f32_inputs_v4f32_output_0123:
-; AVX512-FAST: # %bb.0:
-; AVX512-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512-FAST-NEXT: vaddss %xmm1, %xmm2, %xmm1
-; AVX512-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX512-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
-; AVX512-FAST-NEXT: vzeroupper
-; AVX512-FAST-NEXT: retq
+; AVX512-LABEL: v16f32_inputs_v4f32_output_0123:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vhaddps %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%a0 = extractelement <16 x float> %a, i32 0
%a1 = extractelement <16 x float> %a, i32 1
%b2 = extractelement <16 x float> %b, i32 2
@@ -944,15 +871,7 @@ define <8 x float> @v16f32_inputs_v8f32_output_4567(<16 x float> %a, <16 x float
;
; AVX512-LABEL: v16f32_inputs_v8f32_output_4567:
; AVX512: # %bb.0:
-; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
-; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512-NEXT: vaddss %xmm1, %xmm3, %xmm1
-; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%a4 = extractelement <16 x float> %a, i32 4
%a5 = extractelement <16 x float> %a, i32 5
diff --git a/llvm/test/CodeGen/X86/phaddsub-undef.ll b/llvm/test/CodeGen/X86/phaddsub-undef.ll
index f763dbe4a7a..fb10135e852 100644
--- a/llvm/test/CodeGen/X86/phaddsub-undef.ll
+++ b/llvm/test/CodeGen/X86/phaddsub-undef.ll
@@ -176,7 +176,7 @@ define <16 x i32> @test16_v16i32_undef(<16 x i32> %a, <16 x i32> %b) {
;
; AVX512-LABEL: test16_v16i32_undef:
; AVX512: # %bb.0:
-; AVX512-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: retq
%vecext = extractelement <16 x i32> %a, i32 0
%vecext1 = extractelement <16 x i32> %a, i32 1
@@ -252,7 +252,7 @@ define <16 x i32> @test17_v16i32_undef(<16 x i32> %a, <16 x i32> %b) {
; AVX512-LABEL: test17_v16i32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%vecext = extractelement <16 x i32> %a, i32 0
%vecext1 = extractelement <16 x i32> %a, i32 1
OpenPOWER on IntegriCloud