author     Florian Hahn <flo@fhahn.com>  2019-07-14 20:12:36 +0000
committer  Florian Hahn <flo@fhahn.com>  2019-07-14 20:12:36 +0000
commit     9428d95ce7f84844a076fe13219db96a78e3bd44 (patch)
tree       e447793846bef77b8186d9b38544ed00168491e1
parent     8111807a03c7ecc340fe2d8497b422b09e111fe9 (diff)
[LV] Exclude loop-invariant inputs from scalar cost computation.
Loop-invariant operands do not need to be scalarized, since their values are
computed outside the loop. We should ignore them when computing the
scalarization overhead.

Fixes PR41294.

Reviewers: hsaito, rengolin, dcaballe, Ayal

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D59995

llvm-svn: 366030
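For context, here is a minimal C++ analogue (not part of the commit) of the
PR41294 pattern this fixes, mirroring @test1 from the new test file below: the
aggregate argument is loop-invariant, so its fields are available outside the
loop and should not be charged per-lane insert/extract costs when vectorizing.

// Hypothetical C++ analogue of the test loop: `sv` is loop-invariant, so
// `sv.a` and `sv.b` can be read once outside the vectorized loop; only the
// store varies per iteration.
struct SV { long a, b; };

void test1(long *dst, SV sv, int n) {
  for (int i = 0; i < n; ++i)
    dst[i] = sv.a + sv.b; // operands invariant; no per-lane extraction needed
}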
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp                                       |  64
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll  | 109
2 files changed, 151 insertions(+), 22 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c1bb43bc5bd..22cf9c7db94 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1179,7 +1179,7 @@ public:
/// VF. Return the cost of the instruction, including scalarization overhead
/// if it's needed. The flag NeedToScalarize shows if the call needs to be
/// scalarized -
- // i.e. either vector version isn't available, or is too expensive.
+ /// i.e. either vector version isn't available, or is too expensive.
unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
private:
@@ -1332,6 +1332,30 @@ private:
DecisionList WideningDecisions;
+ /// Returns true if \p V is expected to be vectorized and it needs to be
+ /// extracted.
+ bool needsExtract(Value *V, unsigned VF) const {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
+ return false;
+
+ // Assume we can vectorize V (and hence we need extraction) if the
+ // scalars are not computed yet. This can happen, because it is called
+ // via getScalarizationOverhead from setCostBasedWideningDecision, before
+ // the scalars are collected. That should be a safe assumption in most
+ // cases, because we check if the operands have vectorizable types
+ // beforehand in LoopVectorizationLegality.
+ return Scalars.find(VF) == Scalars.end() ||
+ !isScalarAfterVectorization(I, VF);
+ }
+
+ /// Returns a range containing only operands needing to be extracted.
+ SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
+ unsigned VF) {
+ return SmallVector<Value *, 4>(make_filter_range(
+ Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
+ }
+
public:
/// The loop that we evaluate.
Loop *TheLoop;
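The hunk above is the core of the patch. A simplified, self-contained sketch
(plain structs standing in for LLVM's Value/Loop types, not the real API) of
what needsExtract and filterExtractingOperands decide: only values computed
inside the loop that end up in vectors need an extract; constants and
loop-invariant values pass through for free.

// Simplified model of the filtering logic added above.
#include <iostream>
#include <vector>

struct Operand {
  bool DefinedInLoop;   // stand-in for TheLoop->contains(I)
  bool ScalarAfterVec;  // stand-in for isScalarAfterVectorization(I, VF)
};

// Mirrors needsExtract: only in-loop, vectorized values need an extract.
static bool needsExtract(const Operand &Op, unsigned VF) {
  return VF > 1 && Op.DefinedInLoop && !Op.ScalarAfterVec;
}

static std::vector<Operand>
filterExtractingOperands(const std::vector<Operand> &Ops, unsigned VF) {
  std::vector<Operand> Filtered;
  for (const Operand &Op : Ops)
    if (needsExtract(Op, VF))
      Filtered.push_back(Op);
  return Filtered;
}

int main() {
  // One loop-invariant operand, one value computed (and vectorized) in-loop.
  std::vector<Operand> Ops = {{false, false}, {true, false}};
  std::cout << filterExtractingOperands(Ops, 2).size() << "\n"; // prints 1
}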
@@ -3125,8 +3149,11 @@ unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
FMF = FPMO->getFastMathFlags();
- SmallVector<Value *, 4> Operands(CI->arg_operands());
- return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
+ // Skip operands that do not require extraction/scalarization and do not incur
+ // any overhead.
+ return TTI.getIntrinsicInstrCost(
+ ID, CI->getType(), filterExtractingOperands(CI->arg_operands(), VF), FMF,
+ VF);
}
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
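For the intrinsic path changed above, a rough sketch (assumed costs, not real
TTI queries) of what the filtering buys for a call like @pow in the test
below: both arguments are loop-invariant, so the filtered argument list is
empty and no per-argument extraction cost is added on top of the vector
intrinsic cost.

// Hypothetical cost model for a vectorized intrinsic call: base vector cost
// plus one extract per lane for each argument still needing extraction.
#include <iostream>

static unsigned intrinsicCost(unsigned VectorCost, unsigned ExtractingArgs,
                              unsigned VF) {
  return VectorCost + ExtractingArgs * VF;
}

int main() {
  unsigned VF = 2, PowVectorCost = 10; // assumed cost of a <2 x float> pow
  std::cout << "before: " << intrinsicCost(PowVectorCost, 2, VF) << "\n"; // 14
  std::cout << "after:  " << intrinsicCost(PowVectorCost, 0, VF) << "\n"; // 10
}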
@@ -5346,15 +5373,6 @@ int LoopVectorizationCostModel::computePredInstDiscount(
return true;
};
- // Returns true if an operand that cannot be scalarized must be extracted
- // from a vector. We will account for this scalarization overhead below. Note
- // that the non-void predicated instructions are placed in their own blocks,
- // and their return values are inserted into vectors. Thus, an extract would
- // still be required.
- auto needsExtract = [&](Instruction *I) -> bool {
- return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF);
- };
-
// Compute the expected cost discount from scalarizing the entire expression
// feeding the predicated instruction. We currently only consider expressions
// that are single-use instruction chains.
@@ -5394,7 +5412,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
"Instruction has non-scalar type");
if (canBeScalarized(J))
Worklist.push_back(J);
- else if (needsExtract(J))
+ else if (needsExtract(J, VF))
ScalarCost += TTI.getScalarizationOverhead(
ToVectorTy(J->getType(), VF), false, true);
}
@@ -5684,16 +5702,18 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
return Cost;
- if (CallInst *CI = dyn_cast<CallInst>(I)) {
- SmallVector<const Value *, 4> Operands(CI->arg_operands());
- Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
- } else if (!isa<StoreInst>(I) ||
- !TTI.supportsEfficientVectorElementLoadStore()) {
- SmallVector<const Value *, 4> Operands(I->operand_values());
- Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
- }
+ // Some targets support efficient element stores.
+ if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
+ return Cost;
- return Cost;
+ // Collect operands to consider.
+ CallInst *CI = dyn_cast<CallInst>(I);
+ Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
+
+ // Skip operands that do not require extraction/scalarization and do not incur
+ // any overhead.
+ return Cost + TTI.getOperandsScalarizationOverhead(
+ filterExtractingOperands(Ops, VF), VF);
}
void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
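To make the getScalarizationOverhead change concrete, here is a toy,
self-contained model (flat assumed per-lane costs, not the real TTI numbers)
of the operand overhead before and after filtering, for an instruction whose
two operands are both loop-invariant, as in @test1 below.

// Toy model of TTI.getOperandsScalarizationOverhead: charge one extract per
// lane for every operand that must be read out of a vector.
#include <iostream>

static unsigned operandsOverhead(unsigned NumOperands, unsigned VF) {
  return NumOperands * VF;
}

int main() {
  unsigned VF = 2;
  // Before this patch: both loop-invariant operands were charged.
  std::cout << "before: " << operandsOverhead(2, VF) << "\n"; // 4
  // After: filterExtractingOperands drops them, so no overhead remains.
  std::cout << "after:  " << operandsOverhead(0, VF) << "\n"; // 0
}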
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
new file mode 100644
index 00000000000..c3ad5b078ae
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
@@ -0,0 +1,109 @@
+; REQUIRES: asserts
+
+; RUN: opt -loop-vectorize -mtriple=arm64-apple-ios %s -S -debug -disable-output 2>&1 | FileCheck --check-prefix=CM %s
+; RUN: opt -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 %s -S | FileCheck --check-prefix=FORCED %s
+
+; Test case from PR41294.
+
+; Check scalar cost for extractvalue. The constant and loop invariant operands are free,
+; leaving cost 3 for scalarizing the result + 2 for executing the op with VF 2.
+
+; CM: LV: Scalar loop costs: 7.
+; CM: LV: Found an estimated cost of 5 for VF 2 For instruction: %a = extractvalue { i64, i64 } %sv, 0
+; CM-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction: %b = extractvalue { i64, i64 } %sv, 1
+
+; Check that the extractvalue operands are actually free in vector code.
+
+; FORCED-LABEL: vector.body: ; preds = %vector.body, %vector.ph
+; FORCED-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; FORCED-NEXT: %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %index, i32 0
+; FORCED-NEXT: %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer
+; FORCED-NEXT: %induction = add <2 x i32> %broadcast.splat, <i32 0, i32 1>
+; FORCED-NEXT: %0 = add i32 %index, 0
+; FORCED-NEXT: %1 = extractvalue { i64, i64 } %sv, 0
+; FORCED-NEXT: %2 = extractvalue { i64, i64 } %sv, 0
+; FORCED-NEXT: %3 = insertelement <2 x i64> undef, i64 %1, i32 0
+; FORCED-NEXT: %4 = insertelement <2 x i64> %3, i64 %2, i32 1
+; FORCED-NEXT: %5 = extractvalue { i64, i64 } %sv, 1
+; FORCED-NEXT: %6 = extractvalue { i64, i64 } %sv, 1
+; FORCED-NEXT: %7 = insertelement <2 x i64> undef, i64 %5, i32 0
+; FORCED-NEXT: %8 = insertelement <2 x i64> %7, i64 %6, i32 1
+; FORCED-NEXT: %9 = getelementptr i64, i64* %dst, i32 %0
+; FORCED-NEXT: %10 = add <2 x i64> %4, %8
+; FORCED-NEXT: %11 = getelementptr i64, i64* %9, i32 0
+; FORCED-NEXT: %12 = bitcast i64* %11 to <2 x i64>*
+; FORCED-NEXT: store <2 x i64> %10, <2 x i64>* %12, align 4
+; FORCED-NEXT: %index.next = add i32 %index, 2
+; FORCED-NEXT: %13 = icmp eq i32 %index.next, 0
+; FORCED-NEXT: br i1 %13, label %middle.block, label %vector.body, !llvm.loop !0
+
+define void @test1(i64* %dst, {i64, i64} %sv) {
+entry:
+ br label %loop.body
+
+loop.body:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.body ]
+ %a = extractvalue { i64, i64 } %sv, 0
+ %b = extractvalue { i64, i64 } %sv, 1
+ %addr = getelementptr i64, i64* %dst, i32 %iv
+ %add = add i64 %a, %b
+ store i64 %add, i64* %addr
+ %iv.next = add nsw i32 %iv, 1
+ %cond = icmp ne i32 %iv.next, 0
+ br i1 %cond, label %loop.body, label %exit
+
+exit:
+ ret void
+}
+
+
+; Similar to the test case above, but checks getVectorCallCost as well.
+declare float @pow(float, float) readnone nounwind
+
+; CM: LV: Scalar loop costs: 16.
+; CM: LV: Found an estimated cost of 5 for VF 2 For instruction: %a = extractvalue { float, float } %sv, 0
+; CM-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction: %b = extractvalue { float, float } %sv, 1
+
+; FORCED-LABEL: define void @test_getVectorCallCost
+
+; FORCED-LABEL: vector.body: ; preds = %vector.body, %vector.ph
+; FORCED-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; FORCED-NEXT: %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %index, i32 0
+; FORCED-NEXT: %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer
+; FORCED-NEXT: %induction = add <2 x i32> %broadcast.splat, <i32 0, i32 1>
+; FORCED-NEXT: %0 = add i32 %index, 0
+; FORCED-NEXT: %1 = extractvalue { float, float } %sv, 0
+; FORCED-NEXT: %2 = extractvalue { float, float } %sv, 0
+; FORCED-NEXT: %3 = insertelement <2 x float> undef, float %1, i32 0
+; FORCED-NEXT: %4 = insertelement <2 x float> %3, float %2, i32 1
+; FORCED-NEXT: %5 = extractvalue { float, float } %sv, 1
+; FORCED-NEXT: %6 = extractvalue { float, float } %sv, 1
+; FORCED-NEXT: %7 = insertelement <2 x float> undef, float %5, i32 0
+; FORCED-NEXT: %8 = insertelement <2 x float> %7, float %6, i32 1
+; FORCED-NEXT: %9 = getelementptr float, float* %dst, i32 %0
+; FORCED-NEXT: %10 = call <2 x float> @llvm.pow.v2f32(<2 x float> %4, <2 x float> %8)
+; FORCED-NEXT: %11 = getelementptr float, float* %9, i32 0
+; FORCED-NEXT: %12 = bitcast float* %11 to <2 x float>*
+; FORCED-NEXT: store <2 x float> %10, <2 x float>* %12, align 4
+; FORCED-NEXT: %index.next = add i32 %index, 2
+; FORCED-NEXT: %13 = icmp eq i32 %index.next, 0
+; FORCED-NEXT: br i1 %13, label %middle.block, label %vector.body, !llvm.loop !4
+
+define void @test_getVectorCallCost(float* %dst, {float, float} %sv) {
+entry:
+ br label %loop.body
+
+loop.body:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.body ]
+ %a = extractvalue { float, float } %sv, 0
+ %b = extractvalue { float, float } %sv, 1
+ %addr = getelementptr float, float* %dst, i32 %iv
+ %p = call float @pow(float %a, float %b)
+ store float %p, float* %addr
+ %iv.next = add nsw i32 %iv, 1
+ %cond = icmp ne i32 %iv.next, 0
+ br i1 %cond, label %loop.body, label %exit
+
+exit:
+ ret void
+}
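As a sanity check on the "cost of 5" figures in the CM lines above, a small
back-of-the-envelope computation following the test comment (3 to scalarize
the result, plus the scalar op once per lane at VF 2); the breakdown is the
test's stated reasoning, not output of the cost model itself.

// Back-of-the-envelope check of the expected VF-2 cost per extractvalue.
#include <cassert>

int main() {
  const unsigned VF = 2;
  const unsigned ScalarizeResult = 3; // inserts building the 2-wide result
  const unsigned PerLaneOp = 1;       // scalar extractvalue executed per lane
  assert(ScalarizeResult + PerLaneOp * VF == 5); // matches the CM check lines
  return 0;
}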