summary refs log tree commit diff stats
path: root/llvm/lib
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib')
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPU.td 7
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp 7
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp 1
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h 5
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp 36
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.h 6
6 files changed, 60 insertions, 2 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index bd47eadcafc..faa9a41c96a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -346,6 +346,13 @@ def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>;
def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>;
def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>;
+def FeatureEnableHugePrivateBuffer : SubtargetFeature<
+ "huge-private-buffer",
+ "EnableHugePrivateBuffer",
+ "true",
+ "Enable private/scratch buffer sizes greater than 128 GB"
+>;
+
def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
"EnableVGPRSpilling",
"true",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 86a0dab30ea..c9cefe3d2da 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1160,8 +1160,13 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
SDValue N1 = Addr.getOperand(1);
// Offsets in vaddr must be positive.
+ //
+ // The total computation of vaddr + soffset + offset must not overflow.
+ // If vaddr is negative, even if offset is 0 the sgpr offset add will end up
+ // overflowing.
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
- if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
+ if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) &&
+ CurDAG->SignBitIsZero(N0)) {
std::tie(VAddr, SOffset) = foldFrameIndex(N0);
ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
return true;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 83122281d2b..8e5a432e068 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -121,6 +121,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
DebuggerReserveRegs(false),
DebuggerEmitPrologue(false),
+ EnableHugePrivateBuffer(false),
EnableVGPRSpilling(false),
EnablePromoteAlloca(false),
EnableLoadStoreOpt(false),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 460ff82efc5..f9b400cfe1b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -130,6 +130,7 @@ protected:
bool DebuggerEmitPrologue;
// Used as options.
+ bool EnableHugePrivateBuffer;
bool EnableVGPRSpilling;
bool EnablePromoteAlloca;
bool EnableLoadStoreOpt;
@@ -351,6 +352,10 @@ public:
return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
}
+ bool enableHugePrivateBuffer() const {
+ return EnableHugePrivateBuffer;
+ }
+
bool isPromoteAllocaEnabled() const {
return EnablePromoteAlloca;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f7fe652dbea..43c4be359f4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -94,6 +94,12 @@ static cl::opt<bool> EnableVGPRIndexMode(
cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
cl::init(false));
+static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
+ "amdgpu-frame-index-zero-bits",
+ cl::desc("High bits of frame index assumed to be zero"),
+ cl::init(5),
+ cl::ReallyHidden);
+
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
@@ -1600,6 +1606,17 @@ SDValue SITargetLowering::LowerFormalArguments(
Reg = MF.addLiveIn(Reg, RC);
SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
+ // The return object should be reasonably addressable.
+
+ // FIXME: This helps when the return is a real sret. If it is an
+ // automatically inserted sret (i.e. CanLowerReturn returns false), an
+ // extra copy is inserted in SelectionDAGBuilder which obscures this.
+ unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
+ Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
+ DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
+ }
+
// If this is an 8 or 16-bit value, it is really passed promoted
// to 32 bits. Insert an assert[sz]ext to capture this, then
// truncate to the right size.
@@ -3216,7 +3233,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::FP_ROUND:
return lowerFP_ROUND(Op, DAG);
-
case ISD::TRAP:
case ISD::DEBUGTRAP:
return lowerTRAP(Op, DAG);
@@ -6997,3 +7013,21 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
TargetLoweringBase::finalizeLowering(MF);
}
+
+void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
+ KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+ TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
+ DAG, Depth);
+
+ if (getSubtarget()->enableHugePrivateBuffer())
+ return;
+
+ // Technically a dispatch with a single workitem could use the full private
+ // memory size, but that's not really useful. We can't use vaddr in MUBUF
+ // instructions unless we know the address calculation won't overflow, so
+ // assume the top AssumeFrameIndexHighZeroBits bits (default 5) are zero.
+ Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
+}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 7fab27461b4..f68f7dc28cd 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -277,6 +277,12 @@ public:
SDValue V) const;
void finalizeLowering(MachineFunction &MF) const override;
+
+ void computeKnownBitsForFrameIndex(const SDValue Op,
+ KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
};
} // End namespace llvm
OpenPOWER on IntegriCloud