summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.h2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp47
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp24
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp60
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp6
6 files changed, 118 insertions, 24 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 6725fb37cab..8f6e1e7d884 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -55,7 +55,7 @@ ModulePass *createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM = nul
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
extern char &AMDGPUAnnotateKernelFeaturesID;
-ModulePass *createAMDGPULowerIntrinsicsPass();
+ModulePass *createAMDGPULowerIntrinsicsPass(const TargetMachine *TM = nullptr);
void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
extern char &AMDGPULowerIntrinsicsID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
index 5721ea41e3b..dcb6670621e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -23,10 +24,16 @@ namespace {
const unsigned MaxStaticSize = 1024;
class AMDGPULowerIntrinsics : public ModulePass {
+private:
+ const TargetMachine *TM;
+
+ bool makeLIDRangeMetadata(Function &F) const;
+
public:
static char ID;
- AMDGPULowerIntrinsics() : ModulePass(ID) { }
+ AMDGPULowerIntrinsics(const TargetMachine *TM = nullptr)
+ : ModulePass(ID), TM(TM) { }
bool runOnModule(Module &M) override;
StringRef getPassName() const override {
return "AMDGPU Lower Intrinsics";
@@ -39,8 +46,8 @@ char AMDGPULowerIntrinsics::ID = 0;
char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID;
-INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE,
- "Lower intrinsics", false, false)
+INITIALIZE_TM_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE,
+ "Lower intrinsics", false, false)
// TODO: Should refine based on estimated number of accesses (e.g. does it
// require splitting based on alignment)
@@ -96,6 +103,23 @@ static bool expandMemIntrinsicUses(Function &F) {
return Changed;
}
+bool AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const {
+ if (!TM)
+ return false;
+
+ bool Changed = false;
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+
+ for (auto *U : F.users()) {
+ auto *CI = dyn_cast<CallInst>(U);
+ if (!CI)
+ continue;
+
+ Changed |= ST.makeLIDRangeMetadata(CI);
+ }
+ return Changed;
+}
+
bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
bool Changed = false;
@@ -110,6 +134,19 @@ bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
if (expandMemIntrinsicUses(F))
Changed = true;
break;
+
+ case Intrinsic::amdgcn_workitem_id_x:
+ case Intrinsic::r600_read_tidig_x:
+ case Intrinsic::amdgcn_workitem_id_y:
+ case Intrinsic::r600_read_tidig_y:
+ case Intrinsic::amdgcn_workitem_id_z:
+ case Intrinsic::r600_read_tidig_z:
+ case Intrinsic::r600_read_local_size_x:
+ case Intrinsic::r600_read_local_size_y:
+ case Intrinsic::r600_read_local_size_z:
+ Changed |= makeLIDRangeMetadata(F);
+ break;
+
default:
break;
}
@@ -118,6 +155,6 @@ bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
return Changed;
}
-ModulePass *llvm::createAMDGPULowerIntrinsicsPass() {
- return new AMDGPULowerIntrinsics();
+ModulePass *llvm::createAMDGPULowerIntrinsicsPass(const TargetMachine *TM) {
+ return new AMDGPULowerIntrinsics(TM);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 96bc53d06cd..4fb262c6277 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -38,7 +38,6 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
@@ -71,7 +70,6 @@ private:
const TargetMachine *TM;
Module *Mod = nullptr;
const DataLayout *DL = nullptr;
- MDNode *MaxWorkGroupSizeRange = nullptr;
AMDGPUAS AS;
// FIXME: This should be per-kernel.
@@ -133,13 +131,6 @@ bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
Mod = &M;
DL = &Mod->getDataLayout();
- // The maximum workitem id.
- //
- // FIXME: Should get as subtarget property. Usually runtime enforced max is
- // 256.
- MDBuilder MDB(Mod->getContext());
- MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));
-
const Triple &TT = TM->getTargetTriple();
IsAMDGCN = TT.getArch() == Triple::amdgcn;
@@ -258,6 +249,9 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
std::pair<Value *, Value *>
AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(
+ *Builder.GetInsertBlock()->getParent());
+
if (!IsAMDHSA) {
Function *LocalSizeYFn
= Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
@@ -267,8 +261,8 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
- LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
- LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+ ST.makeLIDRangeMetadata(LocalSizeY);
+ ST.makeLIDRangeMetadata(LocalSizeZ);
return std::make_pair(LocalSizeY, LocalSizeZ);
}
@@ -333,7 +327,7 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
MDNode *MD = MDNode::get(Mod->getContext(), None);
LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
- LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+ ST.makeLIDRangeMetadata(LoadZU);
// Extract y component. Upper half of LoadZU should be zero already.
Value *Y = Builder.CreateLShr(LoadXY, 16);
@@ -342,6 +336,8 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
}
Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(
+ *Builder.GetInsertBlock()->getParent());
Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;
switch (N) {
@@ -364,7 +360,7 @@ Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
CallInst *CI = Builder.CreateCall(WorkitemIdFn);
- CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+ ST.makeLIDRangeMetadata(CI);
return CI;
}
@@ -690,8 +686,6 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
const AMDGPUSubtarget &ST =
TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
- // FIXME: We should also try to get this value from the reqd_work_group_size
- // function attribute if it is available.
unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
const DataLayout &DL = Mod->getDataLayout();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 695d51a5353..972c28579f7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -16,6 +16,7 @@
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetFrameLowering.h"
#include <algorithm>
@@ -240,6 +241,65 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
return Requested;
}
+bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
+ Function *Kernel = I->getParent()->getParent();
+ unsigned MinSize = 0;
+ unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
+ bool IdQuery = false;
+
+ // If reqd_work_group_size is present it narrows value down.
+ if (auto *CI = dyn_cast<CallInst>(I)) {
+ const Function *F = CI->getCalledFunction();
+ if (F) {
+ unsigned Dim = UINT_MAX;
+ switch (F->getIntrinsicID()) {
+ case Intrinsic::amdgcn_workitem_id_x:
+ case Intrinsic::r600_read_tidig_x:
+ IdQuery = true;
+ case Intrinsic::r600_read_local_size_x:
+ Dim = 0;
+ break;
+ case Intrinsic::amdgcn_workitem_id_y:
+ case Intrinsic::r600_read_tidig_y:
+ IdQuery = true;
+ case Intrinsic::r600_read_local_size_y:
+ Dim = 1;
+ break;
+ case Intrinsic::amdgcn_workitem_id_z:
+ case Intrinsic::r600_read_tidig_z:
+ IdQuery = true;
+ case Intrinsic::r600_read_local_size_z:
+ Dim = 2;
+ break;
+ default:
+ break;
+ }
+ if (Dim <= 3) {
+ if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
+ if (Node->getNumOperands() == 3)
+ MinSize = MaxSize = mdconst::extract<ConstantInt>(
+ Node->getOperand(Dim))->getZExtValue();
+ }
+ }
+ }
+
+ if (!MaxSize)
+ return false;
+
+ // Range metadata is [Lo, Hi). For ID query we need to pass max size
+ // as Hi. For size query we need to pass Hi + 1.
+ if (IdQuery)
+ MinSize = 0;
+ else
+ ++MaxSize;
+
+ MDBuilder MDB(I->getContext());
+ MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
+ APInt(32, MaxSize));
+ I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+ return true;
+}
+
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
const TargetMachine &TM) :
AMDGPUSubtarget(TT, GPU, FS, TM),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index c61a2ff818f..36bc2498781 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -512,6 +512,9 @@ public:
/// compatible with minimum/maximum number of waves limited by flat work group
/// size, register usage, and/or lds usage.
std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
+
+ /// Creates value range metadata on an workitemid.* inrinsic call or load.
+ bool makeLIDRangeMetadata(Instruction *I) const;
};
class R600Subtarget final : public AMDGPUSubtarget {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 1e7ef584d6e..0202220b801 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -555,12 +555,14 @@ void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
}
void AMDGPUPassConfig::addIRPasses() {
+ const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
+
// There is no reason to run these.
disablePass(&StackMapLivenessID);
disablePass(&FuncletLayoutID);
disablePass(&PatchableFunctionID);
- addPass(createAMDGPULowerIntrinsicsPass());
+ addPass(createAMDGPULowerIntrinsicsPass(&TM));
// Function calls are not supported, so make sure we inline everything.
addPass(createAMDGPUAlwaysInlinePass());
@@ -572,8 +574,6 @@ void AMDGPUPassConfig::addIRPasses() {
// without ever running any passes on the second.
addPass(createBarrierNoopPass());
- const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
-
if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
// TODO: May want to move later or split into an early and late one.
OpenPOWER on IntegriCloud