Diffstat (limited to 'llvm')
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp            | 37
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-combining.ll   | 62
2 files changed, 93 insertions, 6 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9b59b8a91e0..cb99c057a1f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6891,7 +6891,8 @@ static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
// Replace target shuffle mask elements with known undef/zero sentinels.
static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
const APInt &KnownUndef,
- const APInt &KnownZero) {
+ const APInt &KnownZero,
+ bool ResolveKnownZeros = true) {
unsigned NumElts = Mask.size();
assert(KnownUndef.getBitWidth() == NumElts &&
KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
@@ -6899,7 +6900,7 @@ static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
for (unsigned i = 0; i != NumElts; ++i) {
if (KnownUndef[i])
Mask[i] = SM_SentinelUndef;
- else if (KnownZero[i])
+ else if (ResolveKnownZeros && KnownZero[i])
Mask[i] = SM_SentinelZero;
}
}
@@ -33071,17 +33072,36 @@ static SDValue combineX86ShufflesRecursively(
OpZero, DAG, Depth, false))
return SDValue();
- resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
-
SmallVector<int, 64> Mask;
SmallVector<SDValue, 16> Ops;
// We don't need to merge masks if the root is empty.
bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
if (EmptyRoot) {
+ // Only resolve zeros if it will remove an input, otherwise we might end
+ // up in an infinite loop.
+ bool ResolveKnownZeros = true;
+ if (!OpZero.isNullValue()) {
+ APInt UsedInputs = APInt::getNullValue(OpInputs.size());
+ for (int i = 0, e = OpMask.size(); i != e; ++i) {
+ int M = OpMask[i];
+ if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
+ continue;
+ UsedInputs.setBit(M / OpMask.size());
+ if (UsedInputs.isAllOnesValue()) {
+ ResolveKnownZeros = false;
+ break;
+ }
+ }
+ }
+ resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
+ ResolveKnownZeros);
+
Mask = OpMask;
Ops.append(OpInputs.begin(), OpInputs.end());
} else {
+ resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
+
// Add the inputs to the Ops list, avoiding duplicates.
Ops.append(SrcOps.begin(), SrcOps.end());
@@ -33216,13 +33236,18 @@ static SDValue combineX86ShufflesRecursively(
// the remaining recursion depth.
if (Ops.size() < (MaxRecursionDepth - Depth)) {
for (int i = 0, e = Ops.size(); i < e; ++i) {
+ // For empty roots, we need to resolve zeroable elements before combining
+ // them with other shuffles.
+ SmallVector<int, 64> ResolvedMask = Mask;
+ if (EmptyRoot)
+ resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
bool AllowVar = false;
if (Ops[i].getNode()->hasOneUse() ||
SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
AllowVar = AllowVariableMask;
if (SDValue Res = combineX86ShufflesRecursively(
- Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
- AllowVar, DAG, Subtarget))
+ Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1,
+ HasVariableMask, AllowVar, DAG, Subtarget))
return Res;
}
}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index 871321e391e..b5dff70e234 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2981,3 +2981,65 @@ define <8 x i16> @shuffle_extract_concat_insert(<4 x i16> %lhsa, <4 x i16> %rhsa
%7 = insertelement <8 x i16> %6, i16 %b15, i32 7
ret <8 x i16> %7
}
+
+define void @PR43024() {
+; SSE2-LABEL: PR43024:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
+; SSE2-NEXT: movaps %xmm0, (%rax)
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
+; SSE2-NEXT: addss %xmm0, %xmm1
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: addss %xmm0, %xmm1
+; SSE2-NEXT: addss %xmm0, %xmm1
+; SSE2-NEXT: movss %xmm1, (%rax)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: PR43024:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
+; SSSE3-NEXT: movaps %xmm0, (%rax)
+; SSSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSSE3-NEXT: addss %xmm0, %xmm1
+; SSSE3-NEXT: xorps %xmm0, %xmm0
+; SSSE3-NEXT: addss %xmm0, %xmm1
+; SSSE3-NEXT: addss %xmm0, %xmm1
+; SSSE3-NEXT: movss %xmm1, (%rax)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: PR43024:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
+; SSE41-NEXT: movaps %xmm0, (%rax)
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: addss %xmm0, %xmm1
+; SSE41-NEXT: xorps %xmm0, %xmm0
+; SSE41-NEXT: addss %xmm0, %xmm1
+; SSE41-NEXT: addss %xmm0, %xmm1
+; SSE41-NEXT: movss %xmm1, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: PR43024:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
+; AVX-NEXT: vmovaps %xmm0, (%rax)
+; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovss %xmm0, (%rax)
+; AVX-NEXT: retq
+ store <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x0, float 0x0>, <4 x float>* undef, align 16
+ %1 = load <4 x float>, <4 x float>* undef, align 16
+ %2 = fmul <4 x float> %1, <float 0x0, float 0x0, float 0x0, float 0x0>
+ %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %4 = fadd <4 x float> %2, %3
+ %5 = fadd <4 x float> zeroinitializer, %4
+ %6 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
+ %7 = fadd <4 x float> %6, %5
+ %8 = extractelement <4 x float> %7, i32 0
+ store float %8, float* undef, align 8
+ ret void
+}
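
Illustrative note (not part of the commit): the code added to combineX86ShufflesRecursively only resolves known-zero lanes into zero sentinels when doing so removes an input operand; otherwise the combine could keep rebuilding an equivalent shuffle and recurse forever (PR43024). Below is a minimal standalone C++ sketch of that check under simplified assumptions: plain std::vector/std::bitset stand in for LLVM's OpMask/APInt types, and the name shouldResolveKnownZeros is hypothetical, not an LLVM API.

#include <bitset>
#include <cstdio>
#include <vector>

// Mask element M in [0, NumInputs * Mask.size()) reads lane M % Mask.size()
// of input M / Mask.size(); negative elements are undef/zero sentinels.
static bool shouldResolveKnownZeros(const std::vector<int> &Mask,
                                    const std::vector<bool> &KnownZero,
                                    unsigned NumInputs) {
  std::bitset<64> UsedInputs; // assume at most 64 inputs for this sketch
  for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
    int M = Mask[i];
    if (M < 0 || KnownZero[i])
      continue; // lane is already undef/zero or about to become zero
    UsedInputs.set(M / Mask.size());
  }
  // Resolving is safe (and profitable) only if some input is left unused.
  return UsedInputs.count() < NumInputs;
}

int main() {
  // Two 4-element inputs: mask values 0..3 read input 0, 4..7 read input 1.
  std::vector<int> Mask = {4, 5, 0, 1};
  std::vector<bool> KnownZero = {true, true, false, false};
  // The lanes that read input 1 are known zero, so resolving drops input 1
  // and the check returns true, mirroring ResolveKnownZeros in the patch.
  std::printf("resolve known zeros: %s\n",
              shouldResolveKnownZeros(Mask, KnownZero, 2) ? "yes" : "no");
  return 0;
}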