Diffstat (limited to 'llvm')
 -rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp            37
 -rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-combining.ll   62
2 files changed, 93 insertions, 6 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9b59b8a91e0..cb99c057a1f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6891,7 +6891,8 @@ static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
 // Replace target shuffle mask elements with known undef/zero sentinels.
 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
                                               const APInt &KnownUndef,
-                                              const APInt &KnownZero) {
+                                              const APInt &KnownZero,
+                                              bool ResolveKnownZeros = true) {
   unsigned NumElts = Mask.size();
   assert(KnownUndef.getBitWidth() == NumElts &&
          KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
@@ -6899,7 +6900,7 @@ static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
   for (unsigned i = 0; i != NumElts; ++i) {
     if (KnownUndef[i])
       Mask[i] = SM_SentinelUndef;
-    else if (KnownZero[i])
+    else if (ResolveKnownZeros && KnownZero[i])
       Mask[i] = SM_SentinelZero;
   }
 }
@@ -33071,17 +33072,36 @@ static SDValue combineX86ShufflesRecursively(
                               OpZero, DAG, Depth, false))
     return SDValue();
 
-  resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
-
   SmallVector<int, 64> Mask;
   SmallVector<SDValue, 16> Ops;
 
   // We don't need to merge masks if the root is empty.
   bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
   if (EmptyRoot) {
+    // Only resolve zeros if it will remove an input, otherwise we might end
+    // up in an infinite loop.
+    bool ResolveKnownZeros = true;
+    if (!OpZero.isNullValue()) {
+      APInt UsedInputs = APInt::getNullValue(OpInputs.size());
+      for (int i = 0, e = OpMask.size(); i != e; ++i) {
+        int M = OpMask[i];
+        if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
+          continue;
+        UsedInputs.setBit(M / OpMask.size());
+        if (UsedInputs.isAllOnesValue()) {
+          ResolveKnownZeros = false;
+          break;
+        }
+      }
+    }
+    resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
+                                      ResolveKnownZeros);
+
     Mask = OpMask;
     Ops.append(OpInputs.begin(), OpInputs.end());
   } else {
+    resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
+
     // Add the inputs to the Ops list, avoiding duplicates.
     Ops.append(SrcOps.begin(), SrcOps.end());
 
@@ -33216,13 +33236,18 @@ static SDValue combineX86ShufflesRecursively(
   // the remaining recursion depth.
   if (Ops.size() < (MaxRecursionDepth - Depth)) {
     for (int i = 0, e = Ops.size(); i < e; ++i) {
+      // For empty roots, we need to resolve zeroable elements before combining
+      // them with other shuffles.
+      SmallVector<int, 64> ResolvedMask = Mask;
+      if (EmptyRoot)
+        resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
       bool AllowVar = false;
       if (Ops[i].getNode()->hasOneUse() ||
           SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
         AllowVar = AllowVariableMask;
       if (SDValue Res = combineX86ShufflesRecursively(
-              Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
-              AllowVar, DAG, Subtarget))
+              Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1,
+              HasVariableMask, AllowVar, DAG, Subtarget))
         return Res;
     }
   }
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index 871321e391e..b5dff70e234 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2981,3 +2981,65 @@ define <8 x i16> @shuffle_extract_concat_insert(<4 x i16> %lhsa, <4 x i16> %rhsa
   %7 = insertelement <8 x i16> %6, i16 %b15, i32 7
   ret <8 x i16> %7
 }
+
+define void @PR43024() {
+; SSE2-LABEL: PR43024:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
+; SSE2-NEXT:    movaps %xmm0, (%rax)
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
+; SSE2-NEXT:    addss %xmm0, %xmm1
+; SSE2-NEXT:    xorps %xmm0, %xmm0
+; SSE2-NEXT:    addss %xmm0, %xmm1
+; SSE2-NEXT:    addss %xmm0, %xmm1
+; SSE2-NEXT:    movss %xmm1, (%rax)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: PR43024:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
+; SSSE3-NEXT:    movaps %xmm0, (%rax)
+; SSSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSSE3-NEXT:    addss %xmm0, %xmm1
+; SSSE3-NEXT:    xorps %xmm0, %xmm0
+; SSSE3-NEXT:    addss %xmm0, %xmm1
+; SSSE3-NEXT:    addss %xmm0, %xmm1
+; SSSE3-NEXT:    movss %xmm1, (%rax)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: PR43024:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
+; SSE41-NEXT:    movaps %xmm0, (%rax)
+; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT:    addss %xmm0, %xmm1
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    addss %xmm0, %xmm1
+; SSE41-NEXT:    addss %xmm0, %xmm1
+; SSE41-NEXT:    movss %xmm1, (%rax)
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: PR43024:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
+; AVX-NEXT:    vmovaps %xmm0, (%rax)
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovss %xmm0, (%rax)
+; AVX-NEXT:    retq
+  store <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x0, float 0x0>, <4 x float>* undef, align 16
+  %1 = load <4 x float>, <4 x float>* undef, align 16
+  %2 = fmul <4 x float> %1, <float 0x0, float 0x0, float 0x0, float 0x0>
+  %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %4 = fadd <4 x float> %2, %3
+  %5 = fadd <4 x float> zeroinitializer, %4
+  %6 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
+  %7 = fadd <4 x float> %6, %5
+  %8 = extractelement <4 x float> %7, i32 0
+  store float %8, float* undef, align 8
+  ret void
+}
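
The core of the change above is the UsedInputs check: for an empty root, known-zero mask elements are only folded to SM_SentinelZero when doing so actually drops at least one shuffle input, otherwise the combine can keep rebuilding the same node and recurse forever. The following standalone C++ sketch restates that decision outside of LLVM, using plain standard-library containers instead of APInt/SmallVector; the helper name shouldResolveKnownZeros and the sentinel constants are hypothetical and exist only for illustration, they are not part of the patch or of any LLVM API.

// Simplified, hypothetical re-statement of the patch's decision (not LLVM API).
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

constexpr int SentinelUndef = -1; // stand-in for SM_SentinelUndef
constexpr int SentinelZero  = -2; // stand-in for SM_SentinelZero

// Decide whether known-zero elements should be resolved: only worthwhile if,
// ignoring undef/zero elements, at least one input vector is left unreferenced
// by the mask (mask element M refers to input M / Mask.size()).
bool shouldResolveKnownZeros(const std::vector<int> &Mask,
                             const std::vector<bool> &KnownUndef,
                             const std::vector<bool> &KnownZero,
                             std::size_t NumInputs) {
  // Mirrors the !OpZero.isNullValue() guard: nothing known zero, nothing to do.
  if (std::none_of(KnownZero.begin(), KnownZero.end(),
                   [](bool B) { return B; }))
    return true;
  std::vector<bool> UsedInputs(NumInputs, false);
  std::size_t NumUsed = 0;
  for (std::size_t i = 0; i != Mask.size(); ++i) {
    int M = Mask[i];
    if (KnownUndef[i] || KnownZero[i] || M == SentinelUndef ||
        M == SentinelZero)
      continue;
    std::size_t Input = static_cast<std::size_t>(M) / Mask.size();
    if (!UsedInputs[Input]) {
      UsedInputs[Input] = true;
      if (++NumUsed == NumInputs)
        return false; // every input still referenced: resolving removes nothing
    }
  }
  return true; // at least one input would drop out of the shuffle
}

int main() {
  // Two 4-element inputs; lanes 2 and 3 are known zero, so only input 0 stays
  // referenced and resolving the zeros is worthwhile.
  std::vector<int> Mask = {0, 1, 6, 7};
  std::vector<bool> Undef(4, false);
  std::vector<bool> Zero = {false, false, true, true};
  std::cout << shouldResolveKnownZeros(Mask, Undef, Zero, 2) << '\n'; // prints 1
}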

