summaryrefslogtreecommitdiffstats
path: root/llvm/lib
diff options
context:
space:
mode:
authorZi Xuan Wu <wuzish@cn.ibm.com>2019-10-08 03:28:33 +0000
committerZi Xuan Wu <wuzish@cn.ibm.com>2019-10-08 03:28:33 +0000
commit9f41deccc0e648a006c9f38e11919f181b6c7e0a (patch)
tree0da247efc8203566946a70869472d3f9438b9251 /llvm/lib
parent9806a1d5f90a21a16c4bfc6d4bb10e0d5b870573 (diff)
downloadbcm5719-llvm-9f41deccc0e648a006c9f38e11919f181b6c7e0a.tar.gz
bcm5719-llvm-9f41deccc0e648a006c9f38e11919f181b6c7e0a.zip
[LoopVectorize][PowerPC] Estimate int and float register pressure separately in loop-vectorize
In loop-vectorize, interleave count and vector factor depend on target register number. Currently, it does not estimate different register pressure for different register class separately(especially for scalar type, float type should not be on the same position with int type), so it's not accurate. Specifically, it causes too many times interleaving/unrolling, result in too many register spills in loop body and hurting performance. So we need classify the register classes in IR level, and importantly these are abstract register classes, and are not the target register class of backend provided in td file. It's used to establish the mapping between the types of IR values and the number of simultaneous live ranges to which we'd like to limit for some set of those types. For example, POWER target, register num is special when VSX is enabled. When VSX is enabled, the number of int scalar register is 32(GPR), float is 64(VSR), but for int and float vector register both are 64(VSR). So there should be 2 kinds of register class when vsx is enabled, and 3 kinds of register class when VSX is NOT enabled. It runs on POWER target, it makes big(+~30%) performance improvement in one specific bmk(503.bwaves_r) of spec2017 and no other obvious degressions. Differential revision: https://reviews.llvm.org/D67148 llvm-svn: 374017
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Analysis/TargetTransformInfo.cpp12
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h3
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.h3
-rw-r--r--llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp35
-rw-r--r--llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h8
-rw-r--r--llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp3
-rw-r--r--llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp5
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h2
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.cpp3
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.h2
-rw-r--r--llvm/lib/Target/XCore/XCoreTargetTransformInfo.h3
-rw-r--r--llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp4
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp151
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp2
15 files changed, 172 insertions, 66 deletions
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index f3d20ce984d..aa93e1d034f 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -466,8 +466,16 @@ int TargetTransformInfo::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
return Cost;
}
-unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector) const {
- return TTIImpl->getNumberOfRegisters(Vector);
+unsigned TargetTransformInfo::getNumberOfRegisters(unsigned ClassID) const {
+ return TTIImpl->getNumberOfRegisters(ClassID);
+}
+
+unsigned TargetTransformInfo::getRegisterClassForType(bool Vector, Type *Ty) const {
+ return TTIImpl->getRegisterClassForType(Vector, Ty);
+}
+
+const char* TargetTransformInfo::getRegisterClassName(unsigned ClassID) const {
+ return TTIImpl->getRegisterClassName(ClassID);
}
unsigned TargetTransformInfo::getRegisterBitWidth(bool Vector) const {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 95cda63b017..d5ef0e5bea3 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -85,7 +85,8 @@ public:
bool enableInterleavedAccessVectorization() { return true; }
- unsigned getNumberOfRegisters(bool Vector) {
+ unsigned getNumberOfRegisters(unsigned ClassID) const {
+ bool Vector = (ClassID == 1);
if (Vector) {
if (ST->hasNEON())
return 32;
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 47e98dac9f6..b878ea3a171 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -122,7 +122,8 @@ public:
/// \name Vector TTI Implementations
/// @{
- unsigned getNumberOfRegisters(bool Vector) {
+ unsigned getNumberOfRegisters(unsigned ClassID) const {
+ bool Vector = (ClassID == 1);
if (Vector) {
if (ST->hasNEON())
return 16;
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 40e53668701..764335128d4 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -594,10 +594,37 @@ bool PPCTTIImpl::enableInterleavedAccessVectorization() {
return true;
}
-unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
- if (Vector && !ST->hasAltivec() && !ST->hasQPX())
- return 0;
- return ST->hasVSX() ? 64 : 32;
+unsigned PPCTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
+ assert(ClassID == GPRRC || ClassID == FPRRC ||
+ ClassID == VRRC || ClassID == VSXRC);
+ if (ST->hasVSX()) {
+ assert(ClassID == GPRRC || ClassID == VSXRC);
+ return ClassID == GPRRC ? 32 : 64;
+ }
+ assert(ClassID == GPRRC || ClassID == FPRRC || ClassID == VRRC);
+ return 32;
+}
+
+unsigned PPCTTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const {
+ if (Vector)
+ return ST->hasVSX() ? VSXRC : VRRC;
+ else if (Ty && Ty->getScalarType()->isFloatTy())
+ return ST->hasVSX() ? VSXRC : FPRRC;
+ else
+ return GPRRC;
+}
+
+const char* PPCTTIImpl::getRegisterClassName(unsigned ClassID) const {
+
+ switch (ClassID) {
+ default:
+ llvm_unreachable("unknown register class");
+ return "PPC::unknown register class";
+ case GPRRC: return "PPC::GPRRC";
+ case FPRRC: return "PPC::FPRRC";
+ case VRRC: return "PPC::VRRC";
+ case VSXRC: return "PPC::VSXRC";
+ }
}
unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const {
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 5d76ee418b6..294c970f21f 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -72,7 +72,13 @@ public:
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
bool IsZeroCmp) const;
bool enableInterleavedAccessVectorization();
- unsigned getNumberOfRegisters(bool Vector);
+
+ enum PPCRegisterClass {
+ GPRRC, FPRRC, VRRC, VSXRC
+ };
+ unsigned getNumberOfRegisters(unsigned ClassID) const;
+ unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const;
+ const char* getRegisterClassName(unsigned ClassID) const;
unsigned getRegisterBitWidth(bool Vector) const;
unsigned getCacheLineSize();
unsigned getPrefetchDistance();
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 8d45e67d73c..11c99aa1117 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -304,7 +304,8 @@ bool SystemZTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
C2.ScaleCost, C2.SetupCost);
}
-unsigned SystemZTTIImpl::getNumberOfRegisters(bool Vector) {
+unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
+ bool Vector = (ClassID == 1);
if (!Vector)
// Discount the stack pointer. Also leave out %r0, since it can't
// be used in an address.
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 16ce2ef1d7a..e59badeb944 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -56,7 +56,7 @@ public:
/// \name Vector TTI Implementations
/// @{
- unsigned getNumberOfRegisters(bool Vector);
+ unsigned getNumberOfRegisters(unsigned ClassID) const;
unsigned getRegisterBitWidth(bool Vector) const;
unsigned getCacheLineSize() { return 256; }
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 46ef765ce0f..1c53e90daea 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -25,10 +25,11 @@ WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const {
return TargetTransformInfo::PSK_FastHardware;
}
-unsigned WebAssemblyTTIImpl::getNumberOfRegisters(bool Vector) {
- unsigned Result = BaseT::getNumberOfRegisters(Vector);
+unsigned WebAssemblyTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
+ unsigned Result = BaseT::getNumberOfRegisters(ClassID);
// For SIMD, use at least 16 registers, as a rough guess.
+ bool Vector = (ClassID == 1);
if (Vector)
Result = std::max(Result, 16u);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index 1b11b4b631e..f0ecc73e91d 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -53,7 +53,7 @@ public:
/// \name Vector TTI Implementations
/// @{
- unsigned getNumberOfRegisters(bool Vector);
+ unsigned getNumberOfRegisters(unsigned ClassID) const;
unsigned getRegisterBitWidth(bool Vector) const;
unsigned getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index b634da1d51f..2f419b78f83 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -116,7 +116,8 @@ llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}
-unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
+unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
+ bool Vector = (ClassID == 1);
if (Vector && !ST->hasSSE1())
return 0;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 9b948dbbb4c..3ff1896f505 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -116,7 +116,7 @@ public:
/// \name Vector TTI Implementations
/// @{
- unsigned getNumberOfRegisters(bool Vector);
+ unsigned getNumberOfRegisters(unsigned ClassID) const;
unsigned getRegisterBitWidth(bool Vector) const;
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const;
unsigned getMaxInterleaveFactor(unsigned VF);
diff --git a/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h b/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h
index 3fecaaa5972..58df1f290ec 100644
--- a/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h
+++ b/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h
@@ -40,7 +40,8 @@ public:
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl()),
TLI(ST->getTargetLowering()) {}
- unsigned getNumberOfRegisters(bool Vector) {
+ unsigned getNumberOfRegisters(unsigned ClassID) const {
+ bool Vector = (ClassID == 1);
if (Vector) {
return 0;
}
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 852bbefaf20..7f119175c4a 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1386,7 +1386,9 @@ void Cost::RateFormula(const Formula &F,
// Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
// additional instruction (at least fill).
- unsigned TTIRegNum = TTI->getNumberOfRegisters(false) - 1;
+ // TODO: Need distinguish register class?
+ unsigned TTIRegNum = TTI->getNumberOfRegisters(
+ TTI->getRegisterClassForType(false, F.getType())) - 1;
if (C.NumRegs > TTIRegNum) {
// Cost already exceeded TTIRegNum, then only newly added register can add
// new instructions.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7e95038a5eb..11e1cd003b4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -983,10 +983,11 @@ public:
/// of a loop.
struct RegisterUsage {
/// Holds the number of loop invariant values that are used in the loop.
- unsigned LoopInvariantRegs;
-
+ /// The key is ClassID of target-provided register class.
+ SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
/// Holds the maximum number of concurrent live intervals in the loop.
- unsigned MaxLocalUsers;
+ /// The key is ClassID of target-provided register class.
+ SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
};
/// \return Returns information about the register usages of the loop for the
@@ -4962,9 +4963,14 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
// Select the largest VF which doesn't require more registers than existing
// ones.
- unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
for (int i = RUs.size() - 1; i >= 0; --i) {
- if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
+ bool Selected = true;
+ for (auto& pair : RUs[i].MaxLocalUsers) {
+ unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
+ if (pair.second > TargetNumRegisters)
+ Selected = false;
+ }
+ if (Selected) {
MaxVF = VFs[i];
break;
}
@@ -5115,22 +5121,12 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
return 1;
- unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1);
- LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
- << " registers\n");
-
- if (VF == 1) {
- if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
- TargetNumRegisters = ForceTargetNumScalarRegs;
- } else {
- if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
- TargetNumRegisters = ForceTargetNumVectorRegs;
- }
-
RegisterUsage R = calculateRegisterUsage({VF})[0];
// We divide by these constants so assume that we have at least one
// instruction that uses at least one register.
- R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
+ for (auto& pair : R.MaxLocalUsers) {
+ pair.second = std::max(pair.second, 1U);
+ }
// We calculate the interleave count using the following formula.
// Subtract the number of loop invariants from the number of available
@@ -5143,13 +5139,35 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
// We also want power of two interleave counts to ensure that the induction
// variable of the vector loop wraps to zero, when tail is folded by masking;
// this currently happens when OptForSize, in which case IC is set to 1 above.
- unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
- R.MaxLocalUsers);
+ unsigned IC = UINT_MAX;
- // Don't count the induction variable as interleaved.
- if (EnableIndVarRegisterHeur)
- IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) /
- std::max(1U, (R.MaxLocalUsers - 1)));
+ for (auto& pair : R.MaxLocalUsers) {
+ unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
+ LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
+ << " registers of "
+ << TTI.getRegisterClassName(pair.first) << " register class\n");
+ if (VF == 1) {
+ if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
+ TargetNumRegisters = ForceTargetNumScalarRegs;
+ } else {
+ if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
+ TargetNumRegisters = ForceTargetNumVectorRegs;
+ }
+ unsigned MaxLocalUsers = pair.second;
+ unsigned LoopInvariantRegs = 0;
+ if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
+ LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
+
+ unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
+ // Don't count the induction variable as interleaved.
+ if (EnableIndVarRegisterHeur) {
+ TmpIC =
+ PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
+ std::max(1U, (MaxLocalUsers - 1)));
+ }
+
+ IC = std::min(IC, TmpIC);
+ }
// Clamp the interleave ranges to reasonable counts.
unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
@@ -5331,7 +5349,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
const DataLayout &DL = TheFunction->getParent()->getDataLayout();
SmallVector<RegisterUsage, 8> RUs(VFs.size());
- SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0);
+ SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
@@ -5361,21 +5379,45 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
// For each VF find the maximum usage of registers.
for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
+ // Count the number of live intervals.
+ SmallMapVector<unsigned, unsigned, 4> RegUsage;
+
if (VFs[j] == 1) {
- MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
- continue;
+ for (auto Inst : OpenIntervals) {
+ unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
+ if (RegUsage.find(ClassID) == RegUsage.end())
+ RegUsage[ClassID] = 1;
+ else
+ RegUsage[ClassID] += 1;
+ }
+ } else {
+ collectUniformsAndScalars(VFs[j]);
+ for (auto Inst : OpenIntervals) {
+ // Skip ignored values for VF > 1.
+ if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
+ continue;
+ if (isScalarAfterVectorization(Inst, VFs[j])) {
+ unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
+ if (RegUsage.find(ClassID) == RegUsage.end())
+ RegUsage[ClassID] = 1;
+ else
+ RegUsage[ClassID] += 1;
+ } else {
+ unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
+ if (RegUsage.find(ClassID) == RegUsage.end())
+ RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
+ else
+ RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
+ }
+ }
}
- collectUniformsAndScalars(VFs[j]);
- // Count the number of live intervals.
- unsigned RegUsage = 0;
- for (auto Inst : OpenIntervals) {
- // Skip ignored values for VF > 1.
- if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() ||
- isScalarAfterVectorization(Inst, VFs[j]))
- continue;
- RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
+
+ for (auto& pair : RegUsage) {
+ if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
+ MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
+ else
+ MaxUsages[j][pair.first] = pair.second;
}
- MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
}
LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
@@ -5386,18 +5428,32 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
}
for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
- unsigned Invariant = 0;
- if (VFs[i] == 1)
- Invariant = LoopInvariants.size();
- else {
- for (auto Inst : LoopInvariants)
- Invariant += GetRegUsage(Inst->getType(), VFs[i]);
+ SmallMapVector<unsigned, unsigned, 4> Invariant;
+
+ for (auto Inst : LoopInvariants) {
+ unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
+ unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
+ if (Invariant.find(ClassID) == Invariant.end())
+ Invariant[ClassID] = Usage;
+ else
+ Invariant[ClassID] += Usage;
}
LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n');
- LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n');
- LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant
- << '\n');
+ LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: "
+ << MaxUsages[i].size() << " item\n");
+ for (const auto& pair : MaxUsages[i]) {
+ LLVM_DEBUG(dbgs() << "LV(REG): RegisterClass: "
+ << TTI.getRegisterClassName(pair.first)
+ << ", " << pair.second << " registers \n");
+ }
+ LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: "
+ << Invariant.size() << " item\n");
+ for (const auto& pair : Invariant) {
+ LLVM_DEBUG(dbgs() << "LV(REG): RegisterClass: "
+ << TTI.getRegisterClassName(pair.first)
+ << ", " << pair.second << " registers \n");
+ }
RU.LoopInvariantRegs = Invariant;
RU.MaxLocalUsers = MaxUsages[i];
@@ -7762,7 +7818,8 @@ bool LoopVectorizePass::runImpl(
// The second condition is necessary because, even if the target has no
// vector registers, loop vectorization may still enable scalar
// interleaving.
- if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
+ if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
+ TTI->getMaxInterleaveFactor(1) < 2)
return false;
bool Changed = false;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 99428c6c5de..a22153bbed1 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5237,7 +5237,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// If the target claims to have no vector registers don't attempt
// vectorization.
- if (!TTI->getNumberOfRegisters(true))
+ if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)))
return false;
// Don't vectorize when the attribute NoImplicitFloat is used.
OpenPOWER on IntegriCloud