diff options
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.h | 2 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp | 47 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 24 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 60 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 6 |
6 files changed, 118 insertions, 24 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 6725fb37cab..8f6e1e7d884 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -55,7 +55,7 @@ ModulePass *createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM = nul void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); extern char &AMDGPUAnnotateKernelFeaturesID; -ModulePass *createAMDGPULowerIntrinsicsPass(); +ModulePass *createAMDGPULowerIntrinsicsPass(const TargetMachine *TM = nullptr); void initializeAMDGPULowerIntrinsicsPass(PassRegistry &); extern char &AMDGPULowerIntrinsicsID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp index 5721ea41e3b..dcb6670621e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -23,10 +24,16 @@ namespace { const unsigned MaxStaticSize = 1024; class AMDGPULowerIntrinsics : public ModulePass { +private: + const TargetMachine *TM; + + bool makeLIDRangeMetadata(Function &F) const; + public: static char ID; - AMDGPULowerIntrinsics() : ModulePass(ID) { } + AMDGPULowerIntrinsics(const TargetMachine *TM = nullptr) + : ModulePass(ID), TM(TM) { } bool runOnModule(Module &M) override; StringRef getPassName() const override { return "AMDGPU Lower Intrinsics"; @@ -39,8 +46,8 @@ char AMDGPULowerIntrinsics::ID = 0; char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID; -INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE, - "Lower intrinsics", false, false) +INITIALIZE_TM_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE, + "Lower intrinsics", false, false) // TODO: Should refine based on estimated number of accesses (e.g. 
does it // require splitting based on alignment) @@ -96,6 +103,23 @@ static bool expandMemIntrinsicUses(Function &F) { return Changed; } +bool AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const { + if (!TM) + return false; + + bool Changed = false; + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F); + + for (auto *U : F.users()) { + auto *CI = dyn_cast<CallInst>(U); + if (!CI) + continue; + + Changed |= ST.makeLIDRangeMetadata(CI); + } + return Changed; +} + bool AMDGPULowerIntrinsics::runOnModule(Module &M) { bool Changed = false; @@ -110,6 +134,19 @@ bool AMDGPULowerIntrinsics::runOnModule(Module &M) { if (expandMemIntrinsicUses(F)) Changed = true; break; + + case Intrinsic::amdgcn_workitem_id_x: + case Intrinsic::r600_read_tidig_x: + case Intrinsic::amdgcn_workitem_id_y: + case Intrinsic::r600_read_tidig_y: + case Intrinsic::amdgcn_workitem_id_z: + case Intrinsic::r600_read_tidig_z: + case Intrinsic::r600_read_local_size_x: + case Intrinsic::r600_read_local_size_y: + case Intrinsic::r600_read_local_size_z: + Changed |= makeLIDRangeMetadata(F); + break; + default: break; } @@ -118,6 +155,6 @@ bool AMDGPULowerIntrinsics::runOnModule(Module &M) { return Changed; } -ModulePass *llvm::createAMDGPULowerIntrinsicsPass() { - return new AMDGPULowerIntrinsics(); +ModulePass *llvm::createAMDGPULowerIntrinsicsPass(const TargetMachine *TM) { + return new AMDGPULowerIntrinsics(TM); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 96bc53d06cd..4fb262c6277 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -38,7 +38,6 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/LLVMContext.h" -#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" @@ -71,7 +70,6 @@ private: const TargetMachine *TM; Module *Mod = nullptr; const DataLayout 
*DL = nullptr; - MDNode *MaxWorkGroupSizeRange = nullptr; AMDGPUAS AS; // FIXME: This should be per-kernel. @@ -133,13 +131,6 @@ bool AMDGPUPromoteAlloca::doInitialization(Module &M) { Mod = &M; DL = &Mod->getDataLayout(); - // The maximum workitem id. - // - // FIXME: Should get as subtarget property. Usually runtime enforced max is - // 256. - MDBuilder MDB(Mod->getContext()); - MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048)); - const Triple &TT = TM->getTargetTriple(); IsAMDGCN = TT.getArch() == Triple::amdgcn; @@ -258,6 +249,9 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { std::pair<Value *, Value *> AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>( + *Builder.GetInsertBlock()->getParent()); + if (!IsAMDHSA) { Function *LocalSizeYFn = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y); @@ -267,8 +261,8 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {}); CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {}); - LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); - LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); + ST.makeLIDRangeMetadata(LocalSizeY); + ST.makeLIDRangeMetadata(LocalSizeZ); return std::make_pair(LocalSizeY, LocalSizeZ); } @@ -333,7 +327,7 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { MDNode *MD = MDNode::get(Mod->getContext(), None); LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD); LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD); - LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); + ST.makeLIDRangeMetadata(LoadZU); // Extract y component. Upper half of LoadZU should be zero already. 
Value *Y = Builder.CreateLShr(LoadXY, 16); @@ -342,6 +336,8 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { } Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) { + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>( + *Builder.GetInsertBlock()->getParent()); Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic; switch (N) { @@ -364,7 +360,7 @@ Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) { Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID); CallInst *CI = Builder.CreateCall(WorkitemIdFn); - CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); + ST.makeLIDRangeMetadata(CI); return CI; } @@ -690,8 +686,6 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction); - // FIXME: We should also try to get this value from the reqd_work_group_size - // function attribute if it is available. unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second; const DataLayout &DL = Mod->getDataLayout(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 695d51a5353..972c28579f7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -16,6 +16,7 @@ #include "SIMachineFunctionInfo.h" #include "llvm/ADT/SmallString.h" #include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/Target/TargetFrameLowering.h" #include <algorithm> @@ -240,6 +241,65 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( return Requested; } +bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { + Function *Kernel = I->getParent()->getParent(); + unsigned MinSize = 0; + unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second; + bool IdQuery = false; + + // If reqd_work_group_size is present it narrows value down. 
+ if (auto *CI = dyn_cast<CallInst>(I)) { + const Function *F = CI->getCalledFunction(); + if (F) { + unsigned Dim = UINT_MAX; + switch (F->getIntrinsicID()) { + case Intrinsic::amdgcn_workitem_id_x: + case Intrinsic::r600_read_tidig_x: + IdQuery = true; + case Intrinsic::r600_read_local_size_x: + Dim = 0; + break; + case Intrinsic::amdgcn_workitem_id_y: + case Intrinsic::r600_read_tidig_y: + IdQuery = true; + case Intrinsic::r600_read_local_size_y: + Dim = 1; + break; + case Intrinsic::amdgcn_workitem_id_z: + case Intrinsic::r600_read_tidig_z: + IdQuery = true; + case Intrinsic::r600_read_local_size_z: + Dim = 2; + break; + default: + break; + } + if (Dim <= 3) { + if (auto Node = Kernel->getMetadata("reqd_work_group_size")) + if (Node->getNumOperands() == 3) + MinSize = MaxSize = mdconst::extract<ConstantInt>( + Node->getOperand(Dim))->getZExtValue(); + } + } + } + + if (!MaxSize) + return false; + + // Range metadata is [Lo, Hi). For ID query we need to pass max size + // as Hi. For size query we need to pass Hi + 1. + if (IdQuery) + MinSize = 0; + else + ++MaxSize; + + MDBuilder MDB(I->getContext()); + MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize), + APInt(32, MaxSize)); + I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); + return true; +} + R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, const TargetMachine &TM) : AMDGPUSubtarget(TT, GPU, FS, TM), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index c61a2ff818f..36bc2498781 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -512,6 +512,9 @@ public: /// compatible with minimum/maximum number of waves limited by flat work group /// size, register usage, and/or lds usage. std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const; + + /// Creates value range metadata on a workitemid.* intrinsic call or load. 
+ bool makeLIDRangeMetadata(Instruction *I) const; }; class R600Subtarget final : public AMDGPUSubtarget { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 1e7ef584d6e..0202220b801 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -555,12 +555,14 @@ void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() { } void AMDGPUPassConfig::addIRPasses() { + const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); + // There is no reason to run these. disablePass(&StackMapLivenessID); disablePass(&FuncletLayoutID); disablePass(&PatchableFunctionID); - addPass(createAMDGPULowerIntrinsicsPass()); + addPass(createAMDGPULowerIntrinsicsPass(&TM)); // Function calls are not supported, so make sure we inline everything. addPass(createAMDGPUAlwaysInlinePass()); @@ -572,8 +574,6 @@ void AMDGPUPassConfig::addIRPasses() { // without ever running any passes on the second. addPass(createBarrierNoopPass()); - const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); - if (TM.getTargetTriple().getArch() == Triple::amdgcn) { // TODO: May want to move later or split into an early and late one. |