Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp | 94
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp             |  1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h               |  4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp         | 10
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp              | 19
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.h                |  1
-rw-r--r--  llvm/lib/Target/AMDGPU/SMInstructions.td               |  8
7 files changed, 128 insertions, 9 deletions
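
In brief, this change lets the backend select scalar memory (SMEM/SMRD) instructions for uniform global loads that provably cannot be clobbered between kernel entry and their execution: AMDGPUAnnotateUniformValues proves the no-clobber property with MemoryDependenceAnalysis and tags the pointer with "amdgpu.noclobber" metadata, SIISelLowering and the smrd_load pattern in SMInstructions.td consume the tag during selection, and the whole path is gated by the off-by-default -amdgpu-scalarize-global-loads option. A minimal IR sketch of the annotation (hypothetical kernel with invented names; kernel arguments cannot carry instruction metadata, so the pass clones the pointer through a zero-index GEP in the entry block and annotates the clone):

  ; Before the pass: a uniform load through a kernel-argument pointer.
  define amdgpu_kernel void @uniform_load(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  entry:
    %v = load i32, i32 addrspace(1)* %in, align 4
    store i32 %v, i32 addrspace(1)* %out, align 4
    ret void
  }

  ; After the pass (sketch): the clone GEP carries both markers, and the
  ; load now reads through it.
  define amdgpu_kernel void @uniform_load(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  entry:
    %in.clone = getelementptr i32, i32 addrspace(1)* %in, i32 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
    %v = load i32, i32 addrspace(1)* %in.clone, align 4
    store i32 %v, i32 addrspace(1)* %out, align 4
    ret void
  }

  !0 = !{}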
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index 16815affbf9..c011be6fa16 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -15,7 +15,10 @@
 #include "AMDGPU.h"
 #include "AMDGPUIntrinsicInfo.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
 #include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/Support/Debug.h"
@@ -30,6 +33,10 @@ namespace {
 class AMDGPUAnnotateUniformValues : public FunctionPass,
        public InstVisitor<AMDGPUAnnotateUniformValues> {
   DivergenceAnalysis *DA;
+  MemoryDependenceResults *MDR;
+  LoopInfo *LI;
+  DenseMap<Value*, GetElementPtrInst*> noClobberClones;
+  bool isKernelFunc;
 
 public:
   static char ID;
@@ -42,12 +49,14 @@ public:
   }
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<DivergenceAnalysis>();
+    AU.addRequired<MemoryDependenceWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
     AU.setPreservesAll();
   }
 
   void visitBranchInst(BranchInst &I);
   void visitLoadInst(LoadInst &I);
-
+  bool isClobberedInFunction(LoadInst * Load);
 };
 
 } // End anonymous namespace
@@ -55,6 +64,8 @@ public:
 INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
                       "Add AMDGPU uniform metadata", false, false)
 INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
                     "Add AMDGPU uniform metadata", false, false)
 
@@ -63,6 +74,46 @@ char AMDGPUAnnotateUniformValues::ID = 0;
 static void setUniformMetadata(Instruction *I) {
   I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {}));
 }
+static void setNoClobberMetadata(Instruction *I) {
+  I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {}));
+}
+
+static void DFS(BasicBlock *Root, SetVector<BasicBlock*> & Set) {
+  for (auto I : predecessors(Root))
+    if (Set.insert(I))
+      DFS(I, Set);
+}
+
+bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
+  // 1. get the Loop for Load->getParent();
+  // 2. if it exists, collect all the BBs from the most outer
+  // loop and check for the writes. If NOT - start DFS over all preds.
+  // 3. Start DFS over all preds from the most outer loop header.
+  SetVector<BasicBlock *> Checklist;
+  BasicBlock *Start = Load->getParent();
+  Checklist.insert(Start);
+  const Value *Ptr = Load->getPointerOperand();
+  const Loop *L = LI->getLoopFor(Start);
+  if (L) {
+    const Loop *P = L;
+    do {
+      L = P;
+      P = P->getParentLoop();
+    } while (P);
+    Checklist.insert(L->block_begin(), L->block_end());
+    Start = L->getHeader();
+  }
+
+  DFS(Start, Checklist);
+  for (auto &BB : Checklist) {
+    BasicBlock::iterator StartIt = (BB == Load->getParent()) ?
+      BasicBlock::iterator(Load) : BB->end();
+    if (MDR->getPointerDependencyFrom(MemoryLocation(Ptr),
+        true, StartIt, BB, Load).isClobber())
+      return true;
+  }
+  return false;
+}
 
 void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
   if (I.isUnconditional())
@@ -79,10 +130,39 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
   Value *Ptr = I.getPointerOperand();
   if (!DA->isUniform(Ptr))
     return;
+  auto isGlobalLoad = [](LoadInst &Load)->bool {
+    return Load.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+  };
+  // We're tracking up to the Function boundaries
+  // We cannot go beyond because of FunctionPass restrictions
+  // Thus we can ensure that memory is not clobbered for memory
+  // operations that live in kernel only.
+  bool NotClobbered = isKernelFunc && !isClobberedInFunction(&I);
+  Instruction *PtrI = dyn_cast<Instruction>(Ptr);
+  if (!PtrI && NotClobbered && isGlobalLoad(I)) {
+    if (isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) {
+      // Look up the existing GEP
+      if (noClobberClones.count(Ptr)) {
+        PtrI = noClobberClones[Ptr];
+      } else {
+        // Create GEP of the Value
+        Function *F = I.getParent()->getParent();
+        Value *Idx = Constant::getIntegerValue(
+            Type::getInt32Ty(Ptr->getContext()), APInt(64, 0));
+        // Insert GEP at the entry to make it dominate all uses
+        PtrI = GetElementPtrInst::Create(
+            Ptr->getType()->getPointerElementType(), Ptr,
+            ArrayRef<Value*>(Idx), Twine(""), F->getEntryBlock().getFirstNonPHI());
+      }
+      I.replaceUsesOfWith(Ptr, PtrI);
+    }
+  }
 
-  if (Instruction *PtrI = dyn_cast<Instruction>(Ptr))
+  if (PtrI) {
     setUniformMetadata(PtrI);
-
+    if (NotClobbered)
+      setNoClobberMetadata(PtrI);
+  }
 }
 
 bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
@@ -93,9 +173,13 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;
 
-  DA = &getAnalysis<DivergenceAnalysis>();
-  visit(F);
+  DA = &getAnalysis<DivergenceAnalysis>();
+  MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  isKernelFunc = F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
 
+  visit(F);
+  noClobberClones.clear();
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index d0dd7a94f20..6a0275a1317 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -121,6 +121,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     CFALUBug(false),
     HasVertexCache(false),
     TexVTXClauseSize(0),
+    ScalarizeGlobal(false),
 
     FeatureDisable(false),
     InstrItins(getInstrItineraryForCPU(GPU)),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 842711b0dd3..939d13763df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -114,6 +114,7 @@ protected:
   bool CFALUBug;
   bool HasVertexCache;
   short TexVTXClauseSize;
+  bool ScalarizeGlobal;
 
   // Dummy feature to use for assembler in tablegen.
   bool FeatureDisable;
@@ -401,6 +402,9 @@ public:
     return alignTo(FlatWorkGroupSize, getWavefrontSize()) / getWavefrontSize();
   }
 
+  void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;}
+  bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;}
+
   /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
   /// for function \p F, or minimum/maximum flat work group sizes explicitly
   /// requested using "amdgpu-flat-work-group-size" attribute attached to
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e1fd95d0917..a62975cde27 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -61,6 +61,14 @@ static cl::opt<bool> EnableLoadStoreVectorizer(
   cl::init(true),
   cl::Hidden);
 
+// Option to control global loads scalarization
+static cl::opt<bool> ScalarizeGlobal(
+  "amdgpu-scalarize-global-loads",
+  cl::desc("Enable global load scalarization"),
+  cl::init(false),
+  cl::Hidden);
+
+
 extern "C" void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -262,6 +270,8 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
     I->setGISelAccessor(*GISel);
   }
 
+  I->setScalarizeGlobalBehavior(ScalarizeGlobal);
+
   return I.get();
 }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index da60a0f7bdc..a0184bfefd0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -610,6 +610,13 @@ bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
   return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
 }
 
+bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
+  const MemSDNode *MemNode = cast<MemSDNode>(N);
+  const Value *Ptr = MemNode->getMemOperand()->getValue();
+  const Instruction *I = dyn_cast<Instruction>(Ptr);
+  return I && I->getMetadata("amdgpu.noclobber");
+}
+
 bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
                                             unsigned DestAS) const {
   // Flat -> private/local is a simple truncate.
@@ -2773,11 +2780,19 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
     if (isMemOpUniform(Load))
       return SDValue();
     // Non-uniform loads will be selected to MUBUF instructions, so they
-    // have the same legalization requires ments as global and private
+    // have the same legalization requirements as global and private
     // loads.
     //
     LLVM_FALLTHROUGH;
-  case AMDGPUAS::GLOBAL_ADDRESS:
+  case AMDGPUAS::GLOBAL_ADDRESS: {
+    if (isMemOpUniform(Load) && isMemOpHasNoClobberedMemOperand(Load))
+      return SDValue();
+    // Non-uniform loads will be selected to MUBUF instructions, so they
+    // have the same legalization requirements as global and private
+    // loads.
+    //
+  }
+  LLVM_FALLTHROUGH;
   case AMDGPUAS::FLAT_ADDRESS:
     if (NumElements > 4)
       return SplitVectorLoad(Op, DAG);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 03846fd5473..56d6ef2a0c1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -129,6 +129,7 @@ public:
                           MachineFunction &MF) const override;
 
   bool isMemOpUniform(const SDNode *N) const;
+  bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const;
   bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
   bool isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 1ae3645cdcb..02656483cd7 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -222,11 +222,15 @@ def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>
 // Scalar Memory Patterns
 //===----------------------------------------------------------------------===//
 
+
 def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
   auto Ld = cast<LoadSDNode>(N);
   return Ld->getAlignment() >= 4 &&
-    Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
-    static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N);
+    ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+    static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) ||
+    (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
+    static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) &&
+    static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)));
 }]>;
 
 def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">;
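
For contrast, a sketch of a case the new isClobberedInFunction check rejects (hypothetical IR, not taken from the commit's tests): a store that may alias the loaded pointer executes before the load, so the backward scan via getPointerDependencyFrom (and, across blocks, the DFS over predecessors and the outermost enclosing loop) reports a clobber. No clone GEP is created, no metadata is attached, and the load stays on the ordinary vector-memory path.

  define amdgpu_kernel void @clobbered_load(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  entry:
    ; May alias %in, and no alias information proves otherwise: a clobber.
    store i32 0, i32 addrspace(1)* %out, align 4
    %v = load i32, i32 addrspace(1)* %in, align 4
    store i32 %v, i32 addrspace(1)* %out, align 4
    ret void
  }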
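Because the new cl::opt defaults to false, generated code is unchanged unless the flag is passed explicitly; keeping the feature opt-in limits risk while it is evaluated. A lit-style smoke test sketch for the new path (the CPU choice and CHECK lines describe the intended selection and are assumptions, not lines copied from the commit's own tests):

  ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-scalarize-global-loads -verify-machineinstrs < %s | FileCheck %s

  ; With the flag on, the uniform, unclobbered global load should be selected
  ; as a scalar SMEM load rather than a vector FLAT/MUBUF load.
  ; CHECK-LABEL: {{^}}smoke:
  ; CHECK: s_load_dword
  ; CHECK-NOT: flat_load_dword
  define amdgpu_kernel void @smoke(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  entry:
    %v = load i32, i32 addrspace(1)* %in, align 4
    store i32 %v, i32 addrspace(1)* %out, align 4
    ret void
  }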