Diffstat (limited to 'llvm/lib/Target/AMDGPU')
 llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp | 94
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp             |  1
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h               |  4
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp         | 10
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp              | 19
 llvm/lib/Target/AMDGPU/SIISelLowering.h                |  1
 llvm/lib/Target/AMDGPU/SMInstructions.td               |  8
7 files changed, 128 insertions, 9 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index 16815affbf9..c011be6fa16 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -15,7 +15,10 @@
#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Debug.h"
@@ -30,6 +33,10 @@ namespace {
class AMDGPUAnnotateUniformValues : public FunctionPass,
public InstVisitor<AMDGPUAnnotateUniformValues> {
DivergenceAnalysis *DA;
+ MemoryDependenceResults *MDR;
+ LoopInfo *LI;
+ DenseMap<Value*, GetElementPtrInst*> noClobberClones;
+ bool isKernelFunc;
public:
static char ID;
@@ -42,12 +49,14 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<DivergenceAnalysis>();
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
AU.setPreservesAll();
}
void visitBranchInst(BranchInst &I);
void visitLoadInst(LoadInst &I);
-
+  bool isClobberedInFunction(LoadInst *Load);
};
} // End anonymous namespace
@@ -55,6 +64,8 @@ public:
INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
"Add AMDGPU uniform metadata", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
"Add AMDGPU uniform metadata", false, false)
@@ -63,6 +74,46 @@ char AMDGPUAnnotateUniformValues::ID = 0;
static void setUniformMetadata(Instruction *I) {
I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {}));
}
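+// "amdgpu.noclobber" marks the address computation of a load whose memory is
+// provably not written to anywhere in the function before the load executes.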
+static void setNoClobberMetadata(Instruction *I) {
+ I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {}));
+}
+
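+// Collect all transitive predecessors of Root into Set. SetVector::insert
+// returns false for blocks already seen, which deduplicates the list and
+// terminates the recursion on CFG cycles.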
+static void DFS(BasicBlock *Root, SetVector<BasicBlock*> & Set) {
+ for (auto I : predecessors(Root))
+ if (Set.insert(I))
+ DFS(I, Set);
+}
+
+bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst *Load) {
+  // 1. Get the loop that contains Load->getParent(), if any.
+  // 2. If there is one, add every block of the outermost enclosing loop
+  //    to the check list.
+  // 3. Start the predecessor DFS from the outermost loop's header, or from
+  //    the load's own block if it is not inside a loop.
+ SetVector<BasicBlock *> Checklist;
+ BasicBlock *Start = Load->getParent();
+ Checklist.insert(Start);
+ const Value *Ptr = Load->getPointerOperand();
+ const Loop *L = LI->getLoopFor(Start);
+ if (L) {
+ const Loop *P = L;
+ do {
+ L = P;
+ P = P->getParentLoop();
+ } while (P);
+ Checklist.insert(L->block_begin(), L->block_end());
+ Start = L->getHeader();
+ }
+
+ DFS(Start, Checklist);
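+  // Ask MemoryDependenceAnalysis whether any block on the list clobbers Ptr.
+  // In the load's own block only the instructions above the load are
+  // scanned; every other block is scanned in full, from its end.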
+ for (auto &BB : Checklist) {
+ BasicBlock::iterator StartIt = (BB == Load->getParent()) ?
+ BasicBlock::iterator(Load) : BB->end();
+ if (MDR->getPointerDependencyFrom(MemoryLocation(Ptr),
+ true, StartIt, BB, Load).isClobber())
+ return true;
+ }
+ return false;
+}
void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
if (I.isUnconditional())
@@ -79,10 +130,39 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
Value *Ptr = I.getPointerOperand();
if (!DA->isUniform(Ptr))
return;
+ auto isGlobalLoad = [](LoadInst &Load)->bool {
+ return Load.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+ };
+  // We only track clobbers up to the function boundary, since a
+  // FunctionPass cannot look beyond it. Hence memory can only be proven
+  // unclobbered for operations that live in a kernel entry function.
+ bool NotClobbered = isKernelFunc && !isClobberedInFunction(&I);
+ Instruction *PtrI = dyn_cast<Instruction>(Ptr);
+ if (!PtrI && NotClobbered && isGlobalLoad(I)) {
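+    // A kernel argument or global variable is not an Instruction and so
+    // cannot carry metadata itself; clone the pointer through a zero-index
+    // GEP in the entry block and attach the metadata to that clone.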
+ if (isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) {
+      // Reuse the GEP clone we already created for this pointer, if any.
+ if (noClobberClones.count(Ptr)) {
+ PtrI = noClobberClones[Ptr];
+ } else {
+        // Otherwise create a zero-index GEP over the pointer value.
+ Function *F = I.getParent()->getParent();
+ Value *Idx = Constant::getIntegerValue(
+            Type::getInt64Ty(Ptr->getContext()), APInt(64, 0));
+ // Insert GEP at the entry to make it dominate all uses
+ PtrI = GetElementPtrInst::Create(
+ Ptr->getType()->getPointerElementType(), Ptr,
+ ArrayRef<Value*>(Idx), Twine(""), F->getEntryBlock().getFirstNonPHI());
+ }
+ I.replaceUsesOfWith(Ptr, PtrI);
+ }
+ }
- if (Instruction *PtrI = dyn_cast<Instruction>(Ptr))
+ if (PtrI) {
setUniformMetadata(PtrI);
-
+ if (NotClobbered)
+ setNoClobberMetadata(PtrI);
+ }
}
bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
@@ -93,9 +173,13 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
- DA = &getAnalysis<DivergenceAnalysis>();
- visit(F);
+ DA = &getAnalysis<DivergenceAnalysis>();
+ MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ isKernelFunc = F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
+ visit(F);
+ noClobberClones.clear();
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index d0dd7a94f20..6a0275a1317 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -121,6 +121,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
CFALUBug(false),
HasVertexCache(false),
TexVTXClauseSize(0),
+ ScalarizeGlobal(false),
FeatureDisable(false),
InstrItins(getInstrItineraryForCPU(GPU)),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 842711b0dd3..939d13763df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -114,6 +114,7 @@ protected:
bool CFALUBug;
bool HasVertexCache;
short TexVTXClauseSize;
+ bool ScalarizeGlobal;
// Dummy feature to use for assembler in tablegen.
bool FeatureDisable;
@@ -401,6 +402,9 @@ public:
return alignTo(FlatWorkGroupSize, getWavefrontSize()) / getWavefrontSize();
}
+  void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
+  bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
+
/// \returns Subtarget's default pair of minimum/maximum flat work group sizes
/// for function \p F, or minimum/maximum flat work group sizes explicitly
/// requested using "amdgpu-flat-work-group-size" attribute attached to
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e1fd95d0917..a62975cde27 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -61,6 +61,14 @@ static cl::opt<bool> EnableLoadStoreVectorizer(
cl::init(true),
cl::Hidden);
+// Option to control scalarization of global loads
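+// (off by default; enable with e.g. llc -amdgpu-scalarize-global-loads)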
+static cl::opt<bool> ScalarizeGlobal(
+ "amdgpu-scalarize-global-loads",
+ cl::desc("Enable global load scalarization"),
+ cl::init(false),
+ cl::Hidden);
+
+
extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -262,6 +270,8 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
I->setGISelAccessor(*GISel);
}
+ I->setScalarizeGlobalBehavior(ScalarizeGlobal);
+
return I.get();
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index da60a0f7bdc..a0184bfefd0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -610,6 +610,13 @@ bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
}
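+// Returns true if the pointer operand of this memory operation is an
+// instruction carrying the "amdgpu.noclobber" metadata placed by
+// AMDGPUAnnotateUniformValues on unclobbered uniform global loads.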
+bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
+ const MemSDNode *MemNode = cast<MemSDNode>(N);
+ const Value *Ptr = MemNode->getMemOperand()->getValue();
+ const Instruction *I = dyn_cast<Instruction>(Ptr);
+ return I && I->getMetadata("amdgpu.noclobber");
+}
+
bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
// Flat -> private/local is a simple truncate.
@@ -2773,11 +2780,19 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (isMemOpUniform(Load))
return SDValue();
// Non-uniform loads will be selected to MUBUF instructions, so they
- // have the same legalization requires ments as global and private
+ // have the same legalization requirements as global and private
// loads.
//
LLVM_FALLTHROUGH;
- case AMDGPUAS::GLOBAL_ADDRESS:
+ case AMDGPUAS::GLOBAL_ADDRESS: {
+ if (isMemOpUniform(Load) && isMemOpHasNoClobberedMemOperand(Load))
+ return SDValue();
+    // Loads that are not both uniform and unclobbered are selected to MUBUF
+    // instructions, so they keep the usual legalization requirements of
+    // global loads.
+ }
+ LLVM_FALLTHROUGH;
case AMDGPUAS::FLAT_ADDRESS:
if (NumElements > 4)
return SplitVectorLoad(Op, DAG);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 03846fd5473..56d6ef2a0c1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -129,6 +129,7 @@ public:
MachineFunction &MF) const override;
bool isMemOpUniform(const SDNode *N) const;
+ bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const;
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
bool isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 1ae3645cdcb..02656483cd7 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -222,11 +222,15 @@ def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>
// Scalar Memory Patterns
//===----------------------------------------------------------------------===//
+
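+// Select a scalar (SMRD) load for a sufficiently aligned uniform load from
+// the constant address space, or, when global scalarization is enabled, for
+// a uniform global load proven not to be clobbered ("amdgpu.noclobber").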
def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
auto Ld = cast<LoadSDNode>(N);
return Ld->getAlignment() >= 4 &&
- Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
- static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N);
+ ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) ||
+ (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
+ static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) &&
+ static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)));
}]>;
def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">;