diff options
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.td | 7 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 7 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 1 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 5 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 36 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.h | 6 |
6 files changed, 60 insertions, 2 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index bd47eadcafc..faa9a41c96a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -346,6 +346,13 @@ def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>; def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>; def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>; +def FeatureEnableHugePrivateBuffer : SubtargetFeature< + "huge-private-buffer", + "EnableHugePrivateBuffer", + "true", + "Enable private/scratch buffer sizes greater than 128 GB" +>; + def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling", "EnableVGPRSpilling", "true", diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 86a0dab30ea..c9cefe3d2da 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1160,8 +1160,13 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, SDValue N1 = Addr.getOperand(1); // Offsets in vaddr must be positive. + // + // The total computation of vaddr + soffset + offset must not overflow. + // If vaddr is negative, even if offset is 0 the sgpr offset add will end up + // overflowing. 
ConstantSDNode *C1 = cast<ConstantSDNode>(N1); - if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) { + if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) && + CurDAG->SignBitIsZero(N0)) { std::tie(VAddr, SOffset) = foldFrameIndex(N0); ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); return true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 83122281d2b..8e5a432e068 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -121,6 +121,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, DebuggerReserveRegs(false), DebuggerEmitPrologue(false), + EnableHugePrivateBuffer(false), EnableVGPRSpilling(false), EnablePromoteAlloca(false), EnableLoadStoreOpt(false), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 460ff82efc5..f9b400cfe1b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -130,6 +130,7 @@ protected: bool DebuggerEmitPrologue; // Used as options. + bool EnableHugePrivateBuffer; bool EnableVGPRSpilling; bool EnablePromoteAlloca; bool EnableLoadStoreOpt; @@ -351,6 +352,10 @@ public: return isAmdHsaOS() ? 
TrapHandlerAbiHsa : TrapHandlerAbiNone; } + bool enableHugePrivateBuffer() const { + return EnableHugePrivateBuffer; + } + bool isPromoteAllocaEnabled() const { return EnablePromoteAlloca; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index f7fe652dbea..43c4be359f4 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -94,6 +94,12 @@ static cl::opt<bool> EnableVGPRIndexMode( cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), cl::init(false)); +static cl::opt<unsigned> AssumeFrameIndexHighZeroBits( + "amdgpu-frame-index-zero-bits", + cl::desc("High bits of frame index assumed to be zero"), + cl::init(5), + cl::ReallyHidden); + static unsigned findFirstFreeSGPR(CCState &CCInfo) { unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) { @@ -1600,6 +1606,17 @@ SDValue SITargetLowering::LowerFormalArguments( Reg = MF.addLiveIn(Reg, RC); SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); + if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) { + // The return object should be reasonably addressable. + + // FIXME: This helps when the return is a real sret. If it is an + // automatically inserted sret (i.e. CanLowerReturn returns false), an + // extra copy is inserted in SelectionDAGBuilder which obscures this. + unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits; + Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, + DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits))); + } + // If this is an 8 or 16-bit value, it is really passed promoted // to 32 bits. Insert an assert[sz]ext to capture this, then // truncate to the right size. 
@@ -3216,7 +3233,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::FP_ROUND: return lowerFP_ROUND(Op, DAG); - case ISD::TRAP: case ISD::DEBUGTRAP: return lowerTRAP(Op, DAG); @@ -6997,3 +7013,21 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { TargetLoweringBase::finalizeLowering(MF); } + +void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op, + KnownBits &Known, + const APInt &DemandedElts, + const SelectionDAG &DAG, + unsigned Depth) const { + TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts, + DAG, Depth); + + if (getSubtarget()->enableHugePrivateBuffer()) + return; + + // Technically it may be possible to have a dispatch with a single workitem + // that uses the full private memory size, but that's not really useful. We + // can't use vaddr in MUBUF instructions if we don't know the address + // calculation won't overflow, so assume the sign bit is never set. + Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits); +} diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 7fab27461b4..f68f7dc28cd 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -277,6 +277,12 @@ public: SDValue V) const; void finalizeLowering(MachineFunction &MF) const override; + + void computeKnownBitsForFrameIndex(const SDValue Op, + KnownBits &Known, + const APInt &DemandedElts, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; }; } // End namespace llvm |