diff options
author | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2017-09-20 04:25:58 +0000 |
---|---|---|
committer | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2017-09-20 04:25:58 +0000 |
commit | 5670e6d482b445cdf5c8e1166828b20a02416d10 (patch) | |
tree | 111b9dab27a4dc121d3fbcfbcdbe171776fb98aa /llvm/lib | |
parent | bc683831668e63a8f01f7f442d5ec01f02cf49cb (diff) | |
download | bcm5719-llvm-5670e6d482b445cdf5c8e1166828b20a02416d10.tar.gz bcm5719-llvm-5670e6d482b445cdf5c8e1166828b20a02416d10.zip |
[AMDGPU] Port of HSAIL inliner
Differential Revision: https://reviews.llvm.org/D36849
llvm-svn: 313714
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.h | 3 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUInline.cpp | 208 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 2 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 |
5 files changed, 218 insertions, 1 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 23c0df91b44..893e4cb2a15 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -182,6 +182,9 @@ void initializeAMDGPUAAWrapperPassPass(PassRegistry&); void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &); +Pass *createAMDGPUFunctionInliningPass(); +void initializeAMDGPUInlinerPass(PassRegistry&); + Target &getTheAMDGPUTarget(); Target &getTheGCNTarget(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp new file mode 100644 index 00000000000..ff9e7b50ed5 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp @@ -0,0 +1,208 @@ +//===- AMDGPUInline.cpp - Code to perform simple function inlining --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This is AMDGPU specific replacement of the standard inliner. +/// The main purpose is to account for the fact that calls not only expensive +/// on the AMDGPU, but much more expensive if a private memory pointer is +/// passed to a function as an argument. In this situation, we are unable to +/// eliminate private memory in the caller unless inlined and end up with slow +/// and expensive scratch access. Thus, we boost the inline threshold for such +/// functions here. +/// +//===----------------------------------------------------------------------===// + + +#include "AMDGPU.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/IPO/Inliner.h" + +using namespace llvm; + +#define DEBUG_TYPE "inline" + +static cl::opt<int> +ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200), + cl::desc("Cost of alloca argument")); + +// If the amount of scratch memory to eliminate exceeds our ability to allocate +// it into registers we gain nothing by agressively inlining functions for that +// heuristic. +static cl::opt<unsigned> +ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), + cl::desc("Maximum alloca size to use for inline cost")); + +namespace { + +class AMDGPUInliner : public LegacyInlinerBase { + +public: + AMDGPUInliner() : LegacyInlinerBase(ID) { + initializeAMDGPUInlinerPass(*PassRegistry::getPassRegistry()); + Params = getInlineParams(); + } + + static char ID; // Pass identification, replacement for typeid + + unsigned getInlineThreshold(CallSite CS) const; + + InlineCost getInlineCost(CallSite CS) override; + + bool runOnSCC(CallGraphSCC &SCC) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + TargetTransformInfoWrapperPass *TTIWP; + + InlineParams Params; +}; + +} // end anonymous namespace + +char AMDGPUInliner::ID = 0; +INITIALIZE_PASS_BEGIN(AMDGPUInliner, "amdgpu-inline", + "AMDGPU Function Integration/Inlining", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(AMDGPUInliner, "amdgpu-inline", + "AMDGPU Function Integration/Inlining", false, false) + +Pass *llvm::createAMDGPUFunctionInliningPass() { return new AMDGPUInliner(); } + +bool AMDGPUInliner::runOnSCC(CallGraphSCC &SCC) { + TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>(); + return LegacyInlinerBase::runOnSCC(SCC); +} + +void AMDGPUInliner::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetTransformInfoWrapperPass>(); + LegacyInlinerBase::getAnalysisUsage(AU); +} + +unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const { + int Thres = Params.DefaultThreshold; + + Function *Caller = CS.getCaller(); + // Listen to the inlinehint attribute when it would increase the threshold + // and the caller does not need to minimize its size. + Function *Callee = CS.getCalledFunction(); + bool InlineHint = Callee && !Callee->isDeclaration() && + Callee->hasFnAttribute(Attribute::InlineHint); + if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres + && !Caller->hasFnAttribute(Attribute::MinSize)) + Thres = Params.HintThreshold.getValue(); + + const DataLayout &DL = Caller->getParent()->getDataLayout(); + if (!Callee) + return (unsigned)Thres; + + const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*Caller->getParent()); + + // If we have a pointer to private array passed into a function + // it will not be optimized out, leaving scratch usage. + // Increase the inline threshold to allow inliniting in this case. + uint64_t AllocaSize = 0; + SmallPtrSet<const AllocaInst *, 8> AIVisited; + for (Value *PtrArg : CS.args()) { + Type *Ty = PtrArg->getType(); + if (!Ty->isPointerTy() || + Ty->getPointerAddressSpace() != AS.PRIVATE_ADDRESS) + continue; + PtrArg = GetUnderlyingObject(PtrArg, DL); + if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) { + if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second) + continue; + AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType()); + // If the amount of stack memory is excessive we will not be able + // to get rid of the scratch anyway, bail out. + if (AllocaSize > ArgAllocaCutoff) { + AllocaSize = 0; + break; + } + } + } + if (AllocaSize) + Thres += ArgAllocaCost; + + return (unsigned)Thres; +} + +// Check if call is just a wrapper around another call. +// In this case we only have call and ret instructions. +static bool isWrapperOnlyCall(CallSite CS) { + Function *Callee = CS.getCalledFunction(); + if (!Callee || Callee->size() != 1) + return false; + const BasicBlock &BB = Callee->getEntryBlock(); + if (const Instruction *I = BB.getFirstNonPHI()) { + if (!isa<CallInst>(I)) { + return false; + } + if (isa<ReturnInst>(*std::next(I->getIterator()))) { + DEBUG(dbgs() << " Wrapper only call detected: " + << Callee->getName() << '\n'); + return true; + } + } + return false; +} + +InlineCost AMDGPUInliner::getInlineCost(CallSite CS) { + Function *Callee = CS.getCalledFunction(); + Function *Caller = CS.getCaller(); + TargetTransformInfo &TTI = TTIWP->getTTI(*Callee); + + if (!Callee || Callee->isDeclaration() || CS.isNoInline() || + !TTI.areInlineCompatible(Caller, Callee)) + return llvm::InlineCost::getNever(); + + if (CS.hasFnAttr(Attribute::AlwaysInline)) { + if (isInlineViable(*Callee)) + return llvm::InlineCost::getAlways(); + return llvm::InlineCost::getNever(); + } + + if (isWrapperOnlyCall(CS)) + return llvm::InlineCost::getAlways(); + + InlineParams LocalParams = Params; + LocalParams.DefaultThreshold = (int)getInlineThreshold(CS); + bool RemarksEnabled = false; + const auto &BBs = Caller->getBasicBlockList(); + if (!BBs.empty()) { + auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front()); + if (DI.isEnabled()) + RemarksEnabled = true; + } + + OptimizationRemarkEmitter ORE(Caller); + std::function<AssumptionCache &(Function &)> GetAssumptionCache = + [this](Function &F) -> AssumptionCache & { + return ACT->getAssumptionCache(F); + }; + + return llvm::getInlineCost(CS, Callee, LocalParams, TTI, GetAssumptionCache, + None, PSI, RemarksEnabled ? &ORE : nullptr); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 284f44a6232..212adbce68b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -179,6 +179,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeAMDGPUAAWrapperPassPass(*PR); initializeAMDGPUUseNativeCallsPass(*PR); initializeAMDGPUSimplifyLibCallsPass(*PR); + initializeAMDGPUInlinerPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -332,10 +333,12 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { bool EnableOpt = getOptLevel() > CodeGenOpt::None; bool Internalize = InternalizeSymbols; - bool EarlyInline = EarlyInlineAll && EnableOpt; + bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableAMDGPUFunctionCalls; bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt; bool LibCallSimplify = EnableLibCallSimplify && EnableOpt; + Builder.Inliner = createAMDGPUFunctionInliningPass(); + if (Internalize) { // If we're generating code, we always have the whole program available. The // relocations expected for externally visible functions aren't supported, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index a5225f68af9..ee0683d39b4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -162,6 +162,8 @@ public: bool areInlineCompatible(const Function *Caller, const Function *Callee) const; + + unsigned getInliningThresholdMultiplier() { return 9; } }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 6bae4f2a767..450835f414a 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -51,6 +51,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUTargetTransformInfo.cpp AMDGPUUnifyDivergentExitNodes.cpp AMDGPUUnifyMetadata.cpp + AMDGPUInline.cpp AMDILCFGStructurizer.cpp GCNHazardRecognizer.cpp GCNIterativeScheduler.cpp |