6 files changed, 160 insertions, 0 deletions
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 0a0af384c3e..6cbe3a1f515 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -396,6 +396,9 @@ public:
   bool isLegalMaskedScatter(Type *DataType) const;
   bool isLegalMaskedGather(Type *DataType) const;
 
+  /// Return true if target doesn't mind addresses in vectors.
+  bool prefersVectorizedAddressing() const;
+
   /// \brief Return the cost of the scaling factor used in the addressing
   /// mode represented by AM for this target, for a load/store
   /// of the specified type.
@@ -807,6 +810,7 @@ public:
   virtual bool isLegalMaskedLoad(Type *DataType) = 0;
   virtual bool isLegalMaskedScatter(Type *DataType) = 0;
   virtual bool isLegalMaskedGather(Type *DataType) = 0;
+  virtual bool prefersVectorizedAddressing() = 0;
   virtual int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                    int64_t BaseOffset, bool HasBaseReg,
                                    int64_t Scale, unsigned AddrSpace) = 0;
@@ -1000,6 +1004,9 @@ public:
   bool isLegalMaskedGather(Type *DataType) override {
    return Impl.isLegalMaskedGather(DataType);
   }
+  bool prefersVectorizedAddressing() override {
+    return Impl.prefersVectorizedAddressing();
+  }
   int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                            bool HasBaseReg, int64_t Scale,
                            unsigned AddrSpace) override {
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 550e84ad90c..ad1a7cb748f 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -237,6 +237,8 @@ public:
 
   bool isLegalMaskedGather(Type *DataType) { return false; }
 
+  bool prefersVectorizedAddressing() { return true; }
+
   int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                            bool HasBaseReg, int64_t Scale, unsigned AddrSpace) {
     // Guess that all legal addressing mode are free.
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 8a5d1047366..7a8d4f3be24 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -149,6 +149,10 @@ bool TargetTransformInfo::isLegalMaskedScatter(Type *DataType) const {
   return TTIImpl->isLegalMaskedGather(DataType);
 }
 
+bool TargetTransformInfo::prefersVectorizedAddressing() const {
+  return TTIImpl->prefersVectorizedAddressing();
+}
+
 int TargetTransformInfo::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                               int64_t BaseOffset,
                                               bool HasBaseReg,
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 3766ed45b8c..ad597f5c65f 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -55,6 +55,7 @@ public:
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
 
+  bool prefersVectorizedAddressing() { return false; }
   bool supportsEfficientVectorElementLoadStore() { return true; }
   bool enableInterleavedAccessVectorization() { return true; }
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fa8b613f934..2b83b8426d1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2092,6 +2092,10 @@ private:
   /// The data is collected per VF.
   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
 
+  /// Holds the instructions (address computations) that are forced to be
+  /// scalarized.
+  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
+
   /// Returns the expected difference in cost from scalarizing the expression
   /// feeding a predicated instruction \p PredInst. The instructions to
   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
@@ -5576,6 +5580,13 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
     DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
   }
 
+  // Insert the forced scalars.
+  // FIXME: Currently widenPHIInstruction() often creates a dead vector
+  // induction variable when the PHI user is scalarized.
+  if (ForcedScalars.count(VF))
+    for (auto *I : ForcedScalars.find(VF)->second)
+      Worklist.insert(I);
+
   // Expand the worklist by looking through any bitcasts and getelementptr
   // instructions we've already identified as scalar. This is similar to the
   // expansion step in collectLoopUniforms(); however, here we're only
@@ -7151,9 +7162,16 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   if (VF > 1 && isProfitableToScalarize(I, VF))
     return VectorizationCostTy(InstsToScalarize[VF][I], false);
 
+  // Forced scalars do not have any scalarization overhead.
+  if (VF > 1 && ForcedScalars.count(VF) &&
+      ForcedScalars.find(VF)->second.count(I))
+    return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
+
   Type *VectorTy;
   unsigned C = getInstructionCost(I, VF, VectorTy);
 
+  // Note: Even if all instructions are scalarized, return true if any memory
+  // accesses appear in the loop to get benefits from address folding etc.
   bool TypeNotScalarized =
       VF > 1 && !VectorTy->isVoidTy() && TTI.getNumberOfParts(VectorTy) < VF;
   return VectorizationCostTy(C, TypeNotScalarized);
@@ -7230,6 +7248,62 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
       setWideningDecision(&I, VF, Decision, Cost);
     }
   }
+
+  // Make sure that any load of address and any other address computation
+  // remains scalar unless there is gather/scatter support. This avoids
+  // inevitable extracts into address registers, and also has the benefit of
+  // activating LSR more, since that pass can't optimize vectorized
+  // addresses.
+  if (TTI.prefersVectorizedAddressing())
+    return;
+
+  // Start with all scalar pointer uses.
+  SmallPtrSet<Instruction *, 8> AddrDefs;
+  for (BasicBlock *BB : TheLoop->blocks())
+    for (Instruction &I : *BB) {
+      Instruction *PtrDef =
+        dyn_cast_or_null<Instruction>(getPointerOperand(&I));
+      if (PtrDef && TheLoop->contains(PtrDef) &&
+          getWideningDecision(&I, VF) != CM_GatherScatter)
+        AddrDefs.insert(PtrDef);
+    }
+
+  // Add all instructions used to generate the addresses.
+  SmallVector<Instruction *, 4> Worklist;
+  for (auto *I : AddrDefs)
+    Worklist.push_back(I);
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.pop_back_val();
+    for (auto &Op : I->operands())
+      if (auto *InstOp = dyn_cast<Instruction>(Op))
+        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
+            AddrDefs.insert(InstOp).second == true)
+          Worklist.push_back(InstOp);
+  }
+
+  for (auto *I : AddrDefs) {
+    if (isa<LoadInst>(I)) {
+      // Setting the desired widening decision should ideally be handled in
+      // by cost functions, but since this involves the task of finding out
+      // if the loaded register is involved in an address computation, it is
+      // instead changed here when we know this is the case.
+      if (getWideningDecision(I, VF) == CM_Widen)
+        // Scalarize a widened load of address.
+        setWideningDecision(I, VF, CM_Scalarize,
+                            (VF * getMemoryInstructionCost(I, 1)));
+      else if (auto Group = Legal->getInterleavedAccessGroup(I)) {
+        // Scalarize an interleave group of address loads.
+        for (unsigned I = 0; I < Group->getFactor(); ++I) {
+          if (Instruction *Member = Group->getMember(I))
+            setWideningDecision(Member, VF, CM_Scalarize,
+                                (VF * getMemoryInstructionCost(Member, 1)));
+        }
+      }
+    } else
+      // Make sure I gets scalarized and a cost estimate without
+      // scalarization overhead.
+      ForcedScalars[VF].insert(I);
+  }
 }
 
 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll
new file mode 100644
index 00000000000..1f7a6d29c57
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll
@@ -0,0 +1,72 @@
+; RUN: opt -S -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -dce \
+; RUN:   -instcombine -force-vector-width=2 < %s | FileCheck %s
+;
+; Test that loop vectorizer does not generate vector addresses that must then
+; always be extracted.
+
+; Check that the addresses for a scalarized memory access is not extracted
+; from a vector register.
+define i32 @foo(i32* nocapture %A) {
+;CHECK-LABEL: @foo(
+;CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+;CHECK: %0 = shl nsw i64 %index, 2
+;CHECK: %1 = shl i64 %index, 2
+;CHECK: %2 = or i64 %1, 4
+;CHECK: %3 = getelementptr inbounds i32, i32* %A, i64 %0
+;CHECK: %4 = getelementptr inbounds i32, i32* %A, i64 %2
+;CHECK: store i32 4, i32* %3, align 4
+;CHECK: store i32 4, i32* %4, align 4
+
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = shl nsw i64 %indvars.iv, 2
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %0
+  store i32 4, i32* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 10000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 undef
+}
+
+
+; Check that a load of address is scalarized.
+define i32 @foo1(i32* nocapture noalias %A, i32** nocapture %PtrPtr) {
+;CHECK-LABEL: @foo1(
+;CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+;CHECK: %0 = or i64 %index, 1
+;CHECK: %1 = getelementptr inbounds i32*, i32** %PtrPtr, i64 %index
+;CHECK: %2 = getelementptr inbounds i32*, i32** %PtrPtr, i64 %0
+;CHECK: %3 = load i32*, i32** %1, align 8
+;CHECK: %4 = load i32*, i32** %2, align 8
+;CHECK: %5 = load i32, i32* %3, align 4
+;CHECK: %6 = load i32, i32* %4, align 4
+;CHECK: %7 = insertelement <2 x i32> undef, i32 %5, i32 0
+;CHECK: %8 = insertelement <2 x i32> %7, i32 %6, i32 1
+;CHECK: %9 = getelementptr inbounds i32, i32* %A, i64 %index
+;CHECK: %10 = bitcast i32* %9 to <2 x i32>*
+;CHECK: store <2 x i32> %8, <2 x i32>* %10, align 4
+
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %ptr = getelementptr inbounds i32*, i32** %PtrPtr, i64 %indvars.iv
+  %el = load i32*, i32** %ptr
+  %v = load i32, i32* %el
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 %v, i32* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 10000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 undef
+}
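
As a reading aid, here is a rough C++ equivalent of the two loops in the new addressing.ll test. This sketch is not part of the commit; the function names simply mirror @foo and @foo1, and the loop bodies are my reconstruction from the IR. With -mcpu=z13 and a forced VF of 2, the point of the patch is that the addresses in both loops stay in scalar registers instead of being built in vector registers and then extracted.

// Sketch only (not from the commit): C++ loops corresponding to the test IR.
// foo:  the strided store is scalarized, so its per-lane addresses should be
//       computed by scalar GEPs rather than extracted from a vector.
// foo1: each loaded pointer is itself used as an address, so the pointer
//       loads should be scalarized instead of widened and then extracted.
void foo(int *A) {
  for (long i = 0; i < 10000; ++i)
    A[4 * i] = 4;
}

void foo1(int *__restrict A, int **PtrPtr) {
  for (long i = 0; i < 10000; ++i)
    A[i] = *PtrPtr[i];
}

Since the default implementation in TargetTransformInfoImpl.h still returns true, only targets that override prefersVectorizedAddressing() to return false (here SystemZ) get this forced scalarization of address computations; all other targets are unaffected.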

