-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp               137
-rw-r--r--  llvm/test/CodeGen/X86/insertps-combine.ll               16
-rw-r--r--  llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll    12
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll           8
4 files changed, 145 insertions, 28 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e63c1568d07..fa1a5301bb7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -23733,6 +23733,52 @@ static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
return true;
}
+/// Check a target shuffle mask's inputs to see if we can set any values to
+/// SM_SentinelZero - this is for elements that are known to be zero
+/// (not just zeroable) from their inputs.
+static bool setTargetShuffleZeroElements(SDValue N,
+ SmallVectorImpl<int> &Mask) {
+ bool IsUnary;
+ if (!isTargetShuffle(N.getOpcode()))
+ return false;
+ if (!getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), true, Mask,
+ IsUnary))
+ return false;
+
+ SDValue V1 = N.getOperand(0);
+ SDValue V2 = IsUnary ? V1 : N.getOperand(1);
+
+ while (V1.getOpcode() == ISD::BITCAST)
+ V1 = V1->getOperand(0);
+ while (V2.getOpcode() == ISD::BITCAST)
+ V2 = V2->getOperand(0);
+
+ for (int i = 0, Size = Mask.size(); i != Size; ++i) {
+ int M = Mask[i];
+
+ // Already decoded as SM_SentinelZero / SM_SentinelUndef.
+ if (M < 0)
+ continue;
+
+ SDValue V = M < Size ? V1 : V2;
+
+ // We are referencing an UNDEF input.
+ if (V.isUndef()) {
+ Mask[i] = SM_SentinelUndef;
+ continue;
+ }
+
+ // TODO - handle the Size != (int)V.getNumOperands() cases in future.
+ if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
+ continue;
+ if (!X86::isZeroNode(V.getOperand(M % Size)))
+ continue;
+ Mask[i] = SM_SentinelZero;
+ }
+
+ return true;
+}
+
/// \brief Try to combine x86 target specific shuffles.
static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -23806,6 +23852,96 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
}
+ // Attempt to merge blend(insertps(x,y),zero).
+ if (V0.getOpcode() == X86ISD::INSERTPS ||
+ V1.getOpcode() == X86ISD::INSERTPS) {
+ assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
+
+ // Determine which elements are known to be zero.
+ SmallVector<int, 8> TargetMask;
+ if (!setTargetShuffleZeroElements(N, TargetMask))
+ return SDValue();
+
+ // Helper function to take inner insertps node and attempt to
+ // merge the blend with zero into its zero mask.
+ auto MergeInsertPSAndBlend = [&](SDValue V, int Offset) {
+ if (V.getOpcode() != X86ISD::INSERTPS)
+ return SDValue();
+ SDValue Op0 = V.getOperand(0);
+ SDValue Op1 = V.getOperand(1);
+ SDValue Op2 = V.getOperand(2);
+ unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
+
+ // Check each element of the blend node's target mask - must either
+ // be zeroable (and update the zero mask) or selects the element from
+ // the inner insertps node.
+ for (int i = 0; i != 4; ++i)
+ if (TargetMask[i] < 0)
+ InsertPSMask |= (1u << i);
+ else if (TargetMask[i] != (i + Offset))
+ return SDValue();
+ return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, Op0, Op1,
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+ };
+
+ if (SDValue V = MergeInsertPSAndBlend(V0, 0))
+ return V;
+ if (SDValue V = MergeInsertPSAndBlend(V1, 4))
+ return V;
+ }
+ return SDValue();
+ }
+ case X86ISD::INSERTPS: {
+ assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
+ SDValue Op0 = N.getOperand(0);
+ SDValue Op1 = N.getOperand(1);
+ SDValue Op2 = N.getOperand(2);
+ unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
+ unsigned DstIdx = (InsertPSMask >> 4) & 3;
+
+ // Attempt to merge insertps with an inner target shuffle node.
+ SmallVector<int, 8> TargetMask;
+ if (!setTargetShuffleZeroElements(Op0, TargetMask))
+ return SDValue();
+
+ bool Updated = false;
+ bool UseInput00 = false;
+ bool UseInput01 = false;
+ for (int i = 0; i != 4; ++i) {
+ int M = TargetMask[i];
+ if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
+ // No change if element is already zero or the inserted element.
+ continue;
+ } else if (M < 0) {
+ // If the target mask is undef/zero then we must zero the element.
+ InsertPSMask |= (1u << i);
+ Updated = true;
+ continue;
+ }
+
+ // The input vector element must be inline.
+ if (M != i && M != (i + 4))
+ return SDValue();
+
+ // Determine which inputs of the target shuffle we're using.
+ UseInput00 |= (0 <= M && M < 4);
+ UseInput01 |= (4 <= M);
+ }
+
+ // If we're not using both inputs of the target shuffle then use the
+ // referenced input directly.
+ if (UseInput00 && !UseInput01) {
+ Updated = true;
+ Op0 = Op0.getOperand(0);
+ } else if (!UseInput00 && UseInput01) {
+ Updated = true;
+ Op0 = Op0.getOperand(1);
+ }
+
+ if (Updated)
+ return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, Op0, Op1,
+ DAG.getConstant(InsertPSMask, DL, MVT::i8));
+
return SDValue();
}
default:
@@ -28163,6 +28299,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);
case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
+ case X86ISD::INSERTPS:
case X86ISD::PALIGNR:
case X86ISD::BLENDI:
case X86ISD::UNPCKH:
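
For context (not part of the patch): every combine above manipulates the same 8-bit INSERTPS control immediate, whose layout per the Intel SDM is bits [7:6] = CountS (element selected from the second source), bits [5:4] = CountD (destination lane), and bits [3:0] = ZMask (lanes forced to zero). A minimal standalone C++ sketch of the blend-with-zero fold follows; the helper names are hypothetical, not LLVM APIs.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Hypothetical helper: build an INSERTPS immediate from its three fields.
static uint8_t makeInsertPSImm(unsigned CountS, unsigned CountD,
                               unsigned ZMask) {
  assert(CountS < 4 && CountD < 4 && ZMask < 16);
  return (uint8_t)((CountS << 6) | (CountD << 4) | ZMask);
}

// Fold a following blend-with-zero into the immediate, in the spirit of the
// MergeInsertPSAndBlend lambda: every lane the blend mask marks as zero
// (negative here, like SM_SentinelZero) becomes a ZMask bit.
static uint8_t foldBlendOfZero(uint8_t Imm, const int BlendMask[4]) {
  for (int i = 0; i != 4; ++i)
    if (BlendMask[i] < 0)
      Imm |= (uint8_t)(1u << i);
  return Imm;
}

int main() {
  // shuffle_v4f32_0z27 (see the test below): insertps selects xmm1[2]
  // into lane 3 (CountS=2, CountD=3, ZMask=0), then a blend zeroes lane 1.
  uint8_t Imm = makeInsertPSImm(2, 3, 0x0);   // 0xB0
  int BlendMask[4] = {0, -1, 2, 3};           // lane 1 known zero
  printf("0x%02X -> 0x%02X\n", (unsigned)Imm,
         (unsigned)foldBlendOfZero(Imm, BlendMask));
  // prints: 0xB0 -> 0xB2, i.e. one insertps: xmm0[0],zero,xmm0[2],xmm1[2]
  return 0;
}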
diff --git a/llvm/test/CodeGen/X86/insertps-combine.ll b/llvm/test/CodeGen/X86/insertps-combine.ll
index f2596b6347b..690707b6870 100644
--- a/llvm/test/CodeGen/X86/insertps-combine.ll
+++ b/llvm/test/CodeGen/X86/insertps-combine.ll
@@ -6,16 +6,12 @@
define <4 x float> @shuffle_v4f32_0z27(<4 x float> %x, <4 x float> %a) {
; SSE-LABEL: shuffle_v4f32_0z27:
; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm2, %xmm2
-; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
+; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0z27:
; AVX: # BB#0:
-; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %x, i32 0
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
@@ -50,16 +46,12 @@ define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %xyzw, <4 x float> %abcd) {
define <4 x float> @shuffle_v4f32_0z24(<4 x float> %xyzw, <4 x float> %abcd) {
; SSE-LABEL: shuffle_v4f32_0z24:
; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm2, %xmm2
-; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0z24:
; AVX: # BB#0:
-; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %xyzw, i32 0
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
index 2d6d3b6a0fb..5c6efe6eb2c 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -159,15 +159,13 @@ define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline s
;
; SSE41-LABEL: merge_4f32_f32_012u:
; SSE41: # BB#0:
-; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: merge_4f32_f32_012u:
; AVX: # BB#0:
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT: retq
%ptr0 = getelementptr inbounds float, float* %ptr, i64 0
@@ -195,15 +193,13 @@ define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline s
;
; SSE41-LABEL: merge_4f32_f32_019u:
; SSE41: # BB#0:
-; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: merge_4f32_f32_019u:
; AVX: # BB#0:
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT: retq
%ptr0 = getelementptr inbounds float, float* %ptr, i64 0
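
For context (not part of the patch): the improvement in these two tests also rests on known-zero lanes — a MOVSS load leaves lanes 1-3 zero, so the MOVSS plus an INSERTPS of the adjacent float into lane 1 together equal a single 64-bit MOVQ load with a zeroed upper half. A scalar C++ model of that equivalence (not LLVM code):

#include <cstdio>
#include <cstring>

struct V4 { float e[4]; };

// movss: load one float into lane 0, zero lanes 1-3; then insertps the
// next float from memory into lane 1 (what the old codegen emitted).
static V4 movssThenInsertPS(const float *Mem) {
  V4 V = {{Mem[0], 0.0f, 0.0f, 0.0f}};
  V.e[1] = Mem[1];
  return V;
}

// movq: one 64-bit load covering both floats, upper 64 bits zeroed
// (what the new codegen emits instead).
static V4 movqLoad(const float *Mem) {
  V4 V = {{0.0f, 0.0f, 0.0f, 0.0f}};
  std::memcpy(V.e, Mem, 8);
  return V;
}

int main() {
  float Mem[2] = {1.5f, -2.25f};
  V4 A = movssThenInsertPS(Mem);
  V4 B = movqLoad(Mem);
  printf("equal: %d\n", std::memcmp(&A, &B, sizeof(V4)) == 0); // equal: 1
  return 0;
}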
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 53dbb32235a..9187f3513d6 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1080,15 +1080,11 @@ define <4 x float> @shuffle_v4f32_0zz6(<4 x float> %a, <4 x float> %b) {
; SSE41-LABEL: shuffle_v4f32_0zz6:
; SSE41: # BB#0:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0zz6:
; AVX: # BB#0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 6>
%shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 2, i32 7>
@@ -1129,15 +1125,11 @@ define <4 x float> @shuffle_v4f32_0z24(<4 x float> %a, <4 x float> %b) {
; SSE41-LABEL: shuffle_v4f32_0z24:
; SSE41: # BB#0:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0z24:
; AVX: # BB#0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 2, i32 4>
%shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
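
For context (not part of the patch): in these last two tests the INSERTPS already carries the zero lanes in its ZMask, so folding in the blend-with-zero sets no new bits and the blend (plus the xorps feeding it) simply disappears. A self-contained check of the arithmetic for shuffle_v4f32_0zz6:

#include <cassert>
#include <cstdint>

int main() {
  // insertps xmm0 = xmm0[0],zero,zero,xmm1[2]:
  // CountS=2 (xmm1[2]), CountD=3, ZMask=0b0110 (lanes 1 and 2 zeroed).
  uint8_t Imm = (uint8_t)((2u << 6) | (3u << 4) | 0x6); // 0xB6
  // The removed blend zeroed lanes 1 and 2 as well, so folding it in
  // leaves the immediate unchanged and the blend is fully redundant.
  uint8_t Folded = (uint8_t)(Imm | 0x6);
  assert(Folded == Imm && "blend-with-zero was fully redundant");
  return (Folded == Imm) ? 0 : 1;
}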