summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--llvm/include/llvm/CodeGen/BasicTTIImpl.h71
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp3
-rw-r--r--llvm/test/Analysis/CostModel/X86/reduction.ll2
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll8
4 files changed, 70 insertions, 14 deletions
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index effccee890c..334a720fb06 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -927,16 +927,71 @@ public:
unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise) {
assert(Ty->isVectorTy() && "Expect a vector type");
+ Type *ScalarTy = Ty->getVectorElementType();
unsigned NumVecElts = Ty->getVectorNumElements();
unsigned NumReduxLevels = Log2_32(NumVecElts);
- unsigned ArithCost =
- NumReduxLevels *
- static_cast<T *>(this)->getArithmeticInstrCost(Opcode, Ty);
- // Assume the pairwise shuffles add a cost.
- unsigned ShuffleCost =
- NumReduxLevels * (IsPairwise + 1) *
- static_cast<T *>(this)
- ->getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts / 2, Ty);
+ // Try to calculate arithmetic and shuffle op costs for reduction operations.
+ // We're assuming that reduction operation are performing the following way:
+ // 1. Non-pairwise reduction
+ // %val1 = shufflevector<n x t> %val, <n x t> %undef,
+ // <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
+ // \----------------v-------------/ \----------v------------/
+ // n/2 elements n/2 elements
+ // %red1 = op <n x t> %val, <n x t> val1
+ // After this operation we have a vector %red1 with only maningfull the
+ // first n/2 elements, the second n/2 elements are undefined and can be
+ // dropped. All other operations are actually working with the vector of
+ // length n/2, not n. though the real vector length is still n.
+ // %val2 = shufflevector<n x t> %red1, <n x t> %undef,
+ // <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
+ // \----------------v-------------/ \----------v------------/
+ // n/4 elements 3*n/4 elements
+ // %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of
+ // length n/2, the resulting vector has length n/4 etc.
+ // 2. Pairwise reduction:
+ // Everything is the same except for an additional shuffle operation which
+ // is used to produce operands for pairwise kind of reductions.
+ // %val1 = shufflevector<n x t> %val, <n x t> %undef,
+ // <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef>
+ // \-------------v----------/ \----------v------------/
+ // n/2 elements n/2 elements
+ // %val2 = shufflevector<n x t> %val, <n x t> %undef,
+ // <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef>
+ // \-------------v----------/ \----------v------------/
+ // n/2 elements n/2 elements
+ // %red1 = op <n x t> %val1, <n x t> val2
+ // Again, the operation is performed on <n x t> vector, but the resulting
+ // vector %red1 is <n/2 x t> vector.
+ //
+ // The cost model should take into account that the actual length of the
+ // vector is reduced on each iteration.
+ unsigned ArithCost = 0;
+ unsigned ShuffleCost = 0;
+ auto *ConcreteTTI = static_cast<T *>(this);
+ std::pair<unsigned, MVT> LT =
+ ConcreteTTI->getTLI()->getTypeLegalizationCost(DL, Ty);
+ unsigned LongVectorCount = 0;
+ unsigned MVTLen =
+ LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
+ while (NumVecElts > MVTLen) {
+ NumVecElts /= 2;
+ // Assume the pairwise shuffles add a cost.
+ ShuffleCost += (IsPairwise + 1) *
+ ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
+ NumVecElts, Ty);
+ ArithCost += ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
+ Ty = VectorType::get(ScalarTy, NumVecElts);
+ ++LongVectorCount;
+ }
+ // The minimal length of the vector is limited by the real length of vector
+ // operations performed on the current platform. That's why several final
+ // reduction opertions are perfomed on the vectors with the same
+ // architecture-dependent length.
+ ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
+ ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
+ NumVecElts, Ty);
+ ArithCost += (NumReduxLevels - LongVectorCount) *
+ ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
}
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 45cfa24f283..d1b569d4cd3 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4287,7 +4287,8 @@ private:
int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
int ScalarReduxCost =
- ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
+ (ReduxWidth - 1) *
+ TTI->getArithmeticInstrCost(ReductionOpcode, ScalarTy);
DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
<< " for reduction that starts with " << *FirstReducedVal
diff --git a/llvm/test/Analysis/CostModel/X86/reduction.ll b/llvm/test/Analysis/CostModel/X86/reduction.ll
index 99c4d007380..45e2215cd36 100644
--- a/llvm/test/Analysis/CostModel/X86/reduction.ll
+++ b/llvm/test/Analysis/CostModel/X86/reduction.ll
@@ -33,7 +33,7 @@ define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
%bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
; CHECK-LABEL: reduction_cost_int
-; CHECK: cost of 17 {{.*}} extractelement
+; CHECK: cost of 11 {{.*}} extractelement
; AVX-LABEL: reduction_cost_int
; AVX: cost of 5 {{.*}} extractelement
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
index fbcfc2d6fe9..3c6db2b1050 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
@@ -10,10 +10,10 @@
; return sum;
; }
-; Vector cost is 5, Scalar cost is 32
-; CHECK: Adding cost -27 for reduction that starts with %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
-; Vector cost is 17, Scalar cost is 16
-; SSE2: Adding cost 1 for reduction that starts with %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
+; Vector cost is 5, Scalar cost is 7
+; CHECK: Adding cost -2 for reduction that starts with %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
+; Vector cost is 11, Scalar cost is 7
+; SSE2: Adding cost 4 for reduction that starts with %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
define i32 @test(i32* nocapture readonly %p) {
; CHECK-LABEL: @test(
; CHECK: [[BC:%.*]] = bitcast i32* %p to <8 x i32>*
OpenPOWER on IntegriCloud