[DAGCombiner] Fix infinite loop in vector mul/shl combining

We have the following DAGCombiner transformations: (mul (shl X, c1), c2) -> (mul X, c2 << c1) (mul (shl X, C), Y) -> (shl (mul X, Y), C) (shl (mul x, c1), c2) -> (mul x, c1 << c2) Usually the constant shift is optimised by SelectionDAG::getNode when it is constructed, by SelectionDAG::FoldConstantArithmetic, but when we're dealing with vectors and one of those vector constants contains an undef element FoldConstantArithmetic does not fold and we enter an infinite loop. Fix this by making FoldConstantArithmetic use getNode to decide how to fold each vector element, the same as FoldConstantVectorArithmetic does, and rather than adding the constant shift to the work list instead only apply the transformation if it's already been folded into a constant, as if it's not we're going to loop endlessly. Additionally add missing NoOpaques to one of those transformations, which I noticed when writing the tests for this. Differential Revision: https://reviews.llvm.org/D26605 llvm-svn: 287766
author: John Brawn <john.brawn@arm.com> 2016-11-23 16:05:51 +0000
committer: John Brawn <john.brawn@arm.com> 2016-11-23 16:05:51 +0000
commit: 150addb45cb425d96bdcb1ccc53e0f9a56b15e8b (patch)
tree: cf4370a1fbe33960fcd6ef474974dce6517a047e
parent: 10fc3cfc637695cb1d4ec946a2327c886859b996 (diff)
download: bcm5719-llvm-150addb45cb425d96bdcb1ccc53e0f9a56b15e8b.tar.gz
bcm5719-llvm-150addb45cb425d96bdcb1ccc53e0f9a56b15e8b.zip
4 files changed, 134 insertions, 22 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4890441e37c..b3813ba82ce 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2157,11 +2157,11 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
 
   // (mul (shl X, c1), c2) -> (mul X, c2 << c1)
   if (N0.getOpcode() == ISD::SHL &&
-      isConstantOrConstantVector(N1) &&
-      isConstantOrConstantVector(N0.getOperand(1))) {
+      isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
+      isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) {
     SDValue C3 = DAG.getNode(ISD::SHL, SDLoc(N), VT, N1, N0.getOperand(1));
-    AddToWorklist(C3.getNode());
-    return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), C3);
+    if (isConstantOrConstantVector(C3))
+      return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), C3);
   }
 
   // Change (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one
@@ -4714,8 +4714,8 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
       isConstantOrConstantVector(N1, /* No Opaques */ true) &&
       isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) {
     SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
-    AddToWorklist(Shl.getNode());
-    return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), Shl);
+    if (isConstantOrConstantVector(Shl))
+      return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), Shl);
   }
 
   if (N1C && !N1C->isOpaque())
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index f2bb3795a2d..71c83ae5c0f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3498,25 +3498,22 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
   EVT SVT = VT.getScalarType();
   SmallVector<SDValue, 4> Outputs;
   for (unsigned I = 0, E = BV1->getNumOperands(); I != E; ++I) {
-    ConstantSDNode *V1 = dyn_cast<ConstantSDNode>(BV1->getOperand(I));
-    ConstantSDNode *V2 = dyn_cast<ConstantSDNode>(BV2->getOperand(I));
-    if (!V1 || !V2) // Not a constant, bail.
-      return SDValue();
-
-    if (V1->isOpaque() || V2->isOpaque())
-      return SDValue();
+    SDValue V1 = BV1->getOperand(I);
+    SDValue V2 = BV2->getOperand(I);
 
     // Avoid BUILD_VECTOR nodes that perform implicit truncation.
-    // FIXME: This is valid and could be handled by truncating the APInts.
+    // FIXME: This is valid and could be handled by truncation.
     if (V1->getValueType(0) != SVT || V2->getValueType(0) != SVT)
       return SDValue();
 
     // Fold one vector element.
-    std::pair<APInt, bool> Folded = FoldValue(Opcode, V1->getAPIntValue(),
-                                              V2->getAPIntValue());
-    if (!Folded.second)
+    SDValue ScalarResult = getNode(Opcode, DL, SVT, V1, V2);
+
+    // Scalar folding only succeeded if the result is a constant or UNDEF.
+    if (!ScalarResult.isUndef() && ScalarResult.getOpcode() != ISD::Constant &&
+        ScalarResult.getOpcode() != ISD::ConstantFP)
       return SDValue();
-    Outputs.push_back(getConstant(Folded.first, DL, SVT));
+    Outputs.push_back(ScalarResult);
   }
 
   assert(VT.getVectorNumElements() == Outputs.size() &&
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-mul-shl.ll b/llvm/test/CodeGen/AArch64/dag-combine-mul-shl.ll
new file mode 100644
index 00000000000..00c50059406
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/dag-combine-mul-shl.ll
@@ -0,0 +1,117 @@
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+; CHECK-LABEL: fn1_vector:
+; CHECK:      adrp x[[BASE:[0-9]+]], .LCP
+; CHECK-NEXT: ldr q[[NUM:[0-9]+]], [x[[BASE]],
+; CHECK-NEXT: mul v0.16b, v0.16b, v[[NUM]].16b
+; CHECK-NEXT: ret
+define <16 x i8> @fn1_vector(<16 x i8> %arg) {
+entry:
+  %shl = shl <16 x i8> %arg, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  %mul = mul <16 x i8> %shl, <i8 0, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+  ret <16 x i8> %mul
+}
+
+; CHECK-LABEL: fn2_vector:
+; CHECK:      adrp x[[BASE:[0-9]+]], .LCP
+; CHECK-NEXT: ldr q[[NUM:[0-9]+]], [x[[BASE]],
+; CHECK-NEXT: mul v0.16b, v0.16b, v[[NUM]].16b
+; CHECK-NEXT: ret
+define <16 x i8> @fn2_vector(<16 x i8> %arg) {
+entry:
+  %mul = mul <16 x i8> %arg, <i8 0, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+  %shl = shl <16 x i8> %mul, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  ret <16 x i8> %shl
+}
+
+; CHECK-LABEL: fn1_vector_undef:
+; CHECK:      adrp x[[BASE:[0-9]+]], .LCP
+; CHECK-NEXT: ldr q[[NUM:[0-9]+]], [x[[BASE]],
+; CHECK-NEXT: mul v0.16b, v0.16b, v[[NUM]].16b
+; CHECK-NEXT: ret
+define <16 x i8> @fn1_vector_undef(<16 x i8> %arg) {
+entry:
+  %shl = shl <16 x i8> %arg, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  %mul = mul <16 x i8> %shl, <i8 undef, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+  ret <16 x i8> %mul
+}
+
+; CHECK-LABEL: fn2_vector_undef:
+; CHECK:      adrp x[[BASE:[0-9]+]], .LCP
+; CHECK-NEXT: ldr q[[NUM:[0-9]+]], [x[[BASE]],
+; CHECK-NEXT: mul v0.16b, v0.16b, v[[NUM]].16b
+; CHECK-NEXT: ret
+define <16 x i8> @fn2_vector_undef(<16 x i8> %arg) {
+entry:
+  %mul = mul <16 x i8> %arg, <i8 undef, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+  %shl = shl <16 x i8> %mul, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+  ret <16 x i8> %shl
+}
+
+; CHECK-LABEL: fn1_scalar:
+; CHECK:      mov w[[REG:[0-9]+]], #1664
+; CHECK-NEXT: mul w0, w0, w[[REG]]
+; CHECK-NEXT: ret
+define i32 @fn1_scalar(i32 %arg) {
+entry:
+  %shl = shl i32 %arg, 7
+  %mul = mul i32 %shl, 13
+  ret i32 %mul
+}
+
+; CHECK-LABEL: fn2_scalar:
+; CHECK:      mov w[[REG:[0-9]+]], #1664
+; CHECK-NEXT: mul w0, w0, w[[REG]]
+; CHECK-NEXT: ret
+define i32 @fn2_scalar(i32 %arg) {
+entry:
+  %mul = mul i32 %arg, 13
+  %shl = shl i32 %mul, 7
+  ret i32 %shl
+}
+
+; CHECK-LABEL: fn1_scalar_undef:
+; CHECK:      mov w0
+; CHECK-NEXT: ret
+define i32 @fn1_scalar_undef(i32 %arg) {
+entry:
+  %shl = shl i32 %arg, 7
+  %mul = mul i32 %shl, undef
+  ret i32 %mul
+}
+
+; CHECK-LABEL: fn2_scalar_undef:
+; CHECK:      mov w0
+; CHECK-NEXT: ret
+define i32 @fn2_scalar_undef(i32 %arg) {
+entry:
+  %mul = mul i32 %arg, undef
+  %shl = shl i32 %mul, 7
+  ret i32 %shl
+}
+
+; CHECK-LABEL: fn1_scalar_opaque:
+; CHECK:      mov w[[REG:[0-9]+]], #13
+; CHECK-NEXT: mul w[[REG]], w0, w[[REG]]
+; CHECK-NEXT: lsl w0, w[[REG]], #7
+; CHECK-NEXT: ret
+define i32 @fn1_scalar_opaque(i32 %arg) {
+entry:
+  %bitcast = bitcast i32 13 to i32
+  %shl = shl i32 %arg, 7
+  %mul = mul i32 %shl, %bitcast
+  ret i32 %mul
+}
+
+; CHECK-LABEL: fn2_scalar_opaque:
+; CHECK:      mov w[[REG:[0-9]+]], #13
+; CHECK-NEXT: mul w[[REG]], w0, w[[REG]]
+; CHECK-NEXT: lsl w0, w[[REG]], #7
+; CHECK-NEXT: ret
+define i32 @fn2_scalar_opaque(i32 %arg) {
+entry:
+  %bitcast = bitcast i32 13 to i32
+  %mul = mul i32 %arg, %bitcast
+  %shl = shl i32 %mul, 7
+  ret i32 %shl
+}
diff --git a/llvm/test/CodeGen/X86/shift-pcmp.ll b/llvm/test/CodeGen/X86/shift-pcmp.ll
index 4945d6115db..adfd2f143d1 100644
--- a/llvm/test/CodeGen/X86/shift-pcmp.ll
+++ b/llvm/test/CodeGen/X86/shift-pcmp.ll
@@ -26,15 +26,13 @@ define <8 x i16> @bar(<8 x i16> %a, <8 x i16> %b) {
 ; SSE-LABEL: bar:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    pcmpeqw %xmm1, %xmm0
-; SSE-NEXT:    psrlw $15, %xmm0
-; SSE-NEXT:    psllw $5, %xmm0
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: bar:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrlw $15, %xmm0, %xmm0
-; AVX-NEXT:    vpsllw $5, %xmm0, %xmm0
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
   %icmp = icmp eq <8 x i16> %a, %b
author	John Brawn <john.brawn@arm.com>	2016-11-23 16:05:51 +0000
committer	John Brawn <john.brawn@arm.com>	2016-11-23 16:05:51 +0000
commit	150addb45cb425d96bdcb1ccc53e0f9a56b15e8b (patch)
tree	cf4370a1fbe33960fcd6ef474974dce6517a047e
parent	10fc3cfc637695cb1d4ec946a2327c886859b996 (diff)
download	bcm5719-llvm-150addb45cb425d96bdcb1ccc53e0f9a56b15e8b.tar.gz bcm5719-llvm-150addb45cb425d96bdcb1ccc53e0f9a56b15e8b.zip