[SLP] Fix vectorization for tree with trunc to minimum required bit width.

Summary: If the vectorized tree has truncate to minimum required bit width and the vector type of the cast operation after the truncation is the same as the vector type of the cast operands, count cost of the vector cast operation as 0, because this cast will be later removed. Also, if the vectorization tree root operations are integer cast operations, do not consider them as candidates for truncation. It will just create extra number of the same vector/scalar operations, which will be removed by instcombiner. Reviewers: RKSimon, spatel, mkuper, hfinkel, mssimpso Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D41948 llvm-svn: 322946
author: Alexey Bataev <a.bataev@hotmail.com> 2018-01-19 14:40:13 +0000
committer: Alexey Bataev <a.bataev@hotmail.com> 2018-01-19 14:40:13 +0000
commit: fa80c47c6a6741ad8536ef81034d901c70e934f5 (patch)
tree: 8c15691136c6ffa29e45928a74554b473b6483ed /llvm
parent: 545a20d90b8e5b89ad97e8da274bc05bf17920eb (diff)
download: bcm5719-llvm-fa80c47c6a6741ad8536ef81034d901c70e934f5.tar.gz
bcm5719-llvm-fa80c47c6a6741ad8536ef81034d901c70e934f5.zip
3 files changed, 33 insertions, 21 deletions
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 90f84e00b44..f748ba4b31b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2065,7 +2065,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
                                                          VL0->getType(), SrcTy, VL0);
 
       VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
-      int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
+      int VecCost = 0;
+      // Check if the values are candidates to demote.
+      if (!MinBWs.count(VL0) || VecTy != SrcVecTy)
+        VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
       return VecCost - ScalarCost;
     }
     case Instruction::FCmp:
@@ -4014,9 +4017,24 @@ void BoUpSLP::computeMinimumValueSizes() {
   // additional roots that require investigating in Roots.
   SmallVector<Value *, 32> ToDemote;
   SmallVector<Value *, 4> Roots;
-  for (auto *Root : TreeRoot)
+  for (auto *Root : TreeRoot) {
+    // Do not include top zext/sext/trunc operations to those to be demoted, it
+    // produces noise cast<vect>, trunc <vect>, exctract <vect>, cast <extract>
+    // sequence.
+    if (isa<Constant>(Root))
+      continue;
+    auto *I = dyn_cast<Instruction>(Root);
+    if (!I || !I->hasOneUse() || !Expr.count(I))
+      return;
+    if (isa<ZExtInst>(I) || isa<SExtInst>(I))
+      continue;
+    if (auto *TI = dyn_cast<TruncInst>(I)) {
+      Roots.push_back(TI->getOperand(0));
+      continue;
+    }
     if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
       return;
+  }
 
   // The maximum bit width required to represent all the values that can be
   // demoted without loss of precision. It would be safe to truncate the roots
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll
index f3983d716d0..adfe77f89f5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll
@@ -16,13 +16,10 @@ define { i64, i64 } @patatino(double %arg) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = fptosi <2 x double> [[TMP7]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP9:%.*]] = sext <2 x i32> [[TMP8]] to <2 x i64>
-; CHECK-NEXT:    [[TMP10:%.*]] = trunc <2 x i64> [[TMP9]] to <2 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0
-; CHECK-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
-; CHECK-NEXT:    [[TMP16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP12]], 0
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
-; CHECK-NEXT:    [[TMP17:%.*]] = insertvalue { i64, i64 } [[TMP16]], i64 [[TMP14]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
+; CHECK-NEXT:    [[TMP17:%.*]] = insertvalue { i64, i64 } [[TMP16]], i64 [[TMP11]], 1
 ; CHECK-NEXT:    ret { i64, i64 } [[TMP17]]
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll b/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll
index 924422dd256..c9971b64978 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll
@@ -4,18 +4,15 @@
 define <4 x i32> @sign_extend_v_v(<4 x i16> %lhs) {
 ; CHECK-LABEL: @sign_extend_v_v(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x i16> [[LHS:%.*]], i32 0
-; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[VECEXT]] to i32
-; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[CONV]], i32 0
-; CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <4 x i16> [[LHS]], i32 1
-; CHECK-NEXT:    [[CONV2:%.*]] = sext i16 [[VECEXT1]] to i32
-; CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[CONV2]], i32 1
-; CHECK-NEXT:    [[VECEXT4:%.*]] = extractelement <4 x i16> [[LHS]], i32 2
-; CHECK-NEXT:    [[CONV5:%.*]] = sext i16 [[VECEXT4]] to i32
-; CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT3]], i32 [[CONV5]], i32 2
-; CHECK-NEXT:    [[VECEXT7:%.*]] = extractelement <4 x i16> [[LHS]], i32 3
-; CHECK-NEXT:    [[CONV8:%.*]] = sext i16 [[VECEXT7]] to i32
-; CHECK-NEXT:    [[VECINIT9:%.*]] = insertelement <4 x i32> [[VECINIT6]], i32 [[CONV8]], i32 3
+; CHECK-NEXT:    [[TMP0:%.*]] = sext <4 x i16> [[LHS:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
+; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
+; CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
+; CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT3]], i32 [[TMP3]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+; CHECK-NEXT:    [[VECINIT9:%.*]] = insertelement <4 x i32> [[VECINIT6]], i32 [[TMP4]], i32 3
 ; CHECK-NEXT:    ret <4 x i32> [[VECINIT9]]
 ;
 entry:
author	Alexey Bataev <a.bataev@hotmail.com>	2018-01-19 14:40:13 +0000
committer	Alexey Bataev <a.bataev@hotmail.com>	2018-01-19 14:40:13 +0000
commit	fa80c47c6a6741ad8536ef81034d901c70e934f5 (patch)
tree	8c15691136c6ffa29e45928a74554b473b6483ed /llvm
parent	545a20d90b8e5b89ad97e8da274bc05bf17920eb (diff)
download	bcm5719-llvm-fa80c47c6a6741ad8536ef81034d901c70e934f5.tar.gz bcm5719-llvm-fa80c47c6a6741ad8536ef81034d901c70e934f5.zip