-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp     | 68
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp         | 39
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.h           |  1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td             |  5
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td          |  5
-rw-r--r--  llvm/test/CodeGen/AMDGPU/captured-frame-index.ll  | 50
-rw-r--r--  llvm/test/CodeGen/AMDGPU/local-stack-slot-bug.ll  |  6
7 files changed, 66 insertions, 108 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 951db65efbb..bf3e1da5156 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -51,10 +51,10 @@ public:
   bool runOnMachineFunction(MachineFunction &MF) override;
   void Select(SDNode *N) override;
   const char *getPassName() const override;
-  void PreprocessISelDAG() override;
   void PostprocessISelDAG() override;
 
 private:
+  SDValue foldFrameIndex(SDValue N) const;
   bool isInlineImmediate(const SDNode *N) const;
   bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
                    const R600InstrInfo *TII);
@@ -902,6 +902,12 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
   return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE);
 }
 
+SDValue AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
+  if (auto FI = dyn_cast<FrameIndexSDNode>(N))
+    return CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
+  return N;
+}
+
 bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
                                             SDValue &VAddr, SDValue &SOffset,
                                             SDValue &ImmOffset) const {
@@ -921,14 +927,14 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
     // Offsets in vaddr must be positive.
     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
     if (isLegalMUBUFImmOffset(C1)) {
-      VAddr = N0;
+      VAddr = foldFrameIndex(N0);
       ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
       return true;
     }
   }
 
   // (node)
-  VAddr = Addr;
+  VAddr = foldFrameIndex(Addr);
   ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
   return true;
 }
@@ -1516,62 +1522,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
   return SelectVOP3Mods(In, Src, SrcMods);
 }
 
-void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
-  MachineFrameInfo &MFI = CurDAG->getMachineFunction().getFrameInfo();
-
-  // Handle the perverse case where a frame index is being stored. We don't
-  // want to see multiple frame index operands on the same instruction since
-  // it complicates things and violates some assumptions about frame index
-  // lowering.
-  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
-       I != E; ++I) {
-    SDValue FI = CurDAG->getTargetFrameIndex(I, MVT::i32);
-
-    // It's possible that we have a frame index defined in the function that
-    // isn't used in this block.
-    if (FI.use_empty())
-      continue;
-
-    // Skip over the AssertZext inserted during lowering.
-    SDValue EffectiveFI = FI;
-    auto It = FI->use_begin();
-    if (It->getOpcode() == ISD::AssertZext && FI->hasOneUse()) {
-      EffectiveFI = SDValue(*It, 0);
-      It = EffectiveFI->use_begin();
-    }
-
-    for (auto It = EffectiveFI->use_begin(); !It.atEnd(); ) {
-      SDUse &Use = It.getUse();
-      SDNode *User = Use.getUser();
-      unsigned OpIdx = It.getOperandNo();
-      ++It;
-
-      if (MemSDNode *M = dyn_cast<MemSDNode>(User)) {
-        unsigned PtrIdx = M->getOpcode() == ISD::STORE ? 2 : 1;
-        if (OpIdx == PtrIdx)
-          continue;
-
-        unsigned OpN = M->getNumOperands();
-        SDValue NewOps[8];
-
-        assert(OpN < array_lengthof(NewOps));
-        for (unsigned Op = 0; Op != OpN; ++Op) {
-          if (Op != OpIdx) {
-            NewOps[Op] = M->getOperand(Op);
-            continue;
-          }
-
-          MachineSDNode *Mov = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
-                                                      SDLoc(M), MVT::i32, FI);
-          NewOps[Op] = SDValue(Mov, 0);
-        }
-
-        CurDAG->UpdateNodeOperands(M, makeArrayRef(NewOps, OpN));
-      }
-    }
-  }
-}
-
 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
   const AMDGPUTargetLowering& Lowering =
       *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4570fe585d1..f8d4e6131b0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -89,7 +89,6 @@
 
   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
-  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
   setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
 
   setOperationAction(ISD::SELECT, MVT::i1, Promote);
@@ -1558,7 +1557,6 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
-  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
   case ISD::LOAD: {
     SDValue Result = LowerLOAD(Op, DAG);
@@ -1605,43 +1603,6 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) {
   return nullptr;
 }
 
-SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
-
-  SDLoc SL(Op);
-  FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
-  unsigned FrameIndex = FINode->getIndex();
-
-  // A FrameIndex node represents a 32-bit offset into scratch memory. If the
-  // high bit of a frame index offset were to be set, this would mean that it
-  // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch
-  // buffer, with 64 being the number of threads per wave.
-  //
-  // The maximum private allocation for the entire GPU is 4G, and we are
-  // concerned with the largest the index could ever be for an individual
-  // workitem. This will occur with the minmum dispatch size. If a program
-  // requires more, the dispatch size will be reduced.
-  //
-  // With this limit, we can mark the high bit of the FrameIndex node as known
-  // zero, which is important, because it means in most situations we can prove
-  // that values derived from FrameIndex nodes are non-negative. This enables us
-  // to take advantage of more addressing modes when accessing scratch buffers,
-  // since for scratch reads/writes, the register offset must always be
-  // positive.
-
-  uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024;
-
-  // XXX - It is unclear if partial dispatch works. Assume it works at half wave
-  // granularity. It is probably a full wave.
-  uint64_t MinGranularity = 32;
-
-  unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity);
-  EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits);
-
-  SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
-  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
-                     DAG.getValueType(ExtVT));
-}
-
 bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
   if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
     switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 06e7da63a8f..b65f95f7854 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -33,7 +33,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 37f8f17bff3..d04ff6a86ec 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -278,6 +278,11 @@
 return CurDAG->getTargetConstant(
   N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32);
 }]>;
+def frameindex_to_targetframeindex : SDNodeXForm<frameindex, [{
+  auto FI = cast<FrameIndexSDNode>(N);
+  return CurDAG->getTargetFrameIndex(FI->getIndex(), MVT::i32);
+}]>;
+
 // Copied from the AArch64 backend:
 def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{
 return CurDAG->getTargetConstant(
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 7daa1032f05..86c3fd6815e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1582,6 +1582,11 @@ def : Pat <
 >;
 
 def : Pat <
+  (i32 frameindex:$fi),
+  (V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi)))
+>;
+
+def : Pat <
   (i64 InlineImm<i64>:$imm),
   (S_MOV_B64 InlineImm<i64>:$imm)
 >;
diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
index 161c46b486e..55b32305687 100644
--- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -1,5 +1,17 @@
 ; RUN: llc -march=amdgcn -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
+; GCN-LABEL: {{^}}store_fi_lifetime:
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[FI]]
+define void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 {
+entry:
+  %b = alloca i8
+  call void @llvm.lifetime.start(i64 1, i8* %b)
+  store volatile i8* %b, i8* addrspace(1)* undef
+  call void @llvm.lifetime.end(i64 1, i8* %b)
+  ret void
+}
+
 ; GCN-LABEL: {{^}}stored_fi_to_lds:
 ; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
 ; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
@@ -140,17 +152,18 @@ define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
 }
 
 ; GCN-LABEL: {{^}}stored_fi_to_global_huge_frame_offset:
-; GCN: s_add_i32 [[BASE_1_OFF_0:s[0-9]+]], 0, 0x3ffc
+; GCN: v_mov_b32_e32 [[VAL_0:v[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}}
-; GCN: buffer_store_dword [[BASE_0]], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+; GCN: buffer_store_dword [[VAL_0]], [[BASE_0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+
+; GCN: v_mov_b32_e32 [[BASE_0_1:v[0-9]+]], 0{{$}}
+; GCN: v_add_i32_e32 [[BASE_1_OFF_0:v[0-9]+]], vcc, 0x3ffc, [[BASE_0_1]]
 
-; GCN: v_mov_b32_e32 [[V_BASE_1_OFF_0:v[0-9]+]], [[BASE_1_OFF_0]]
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
-; GCN: s_add_i32 [[BASE_1_OFF_1:s[0-9]+]], 0, 56
-; GCN: buffer_store_dword [[K]], [[V_BASE_1_OFF_0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN: v_add_i32_e32 [[BASE_1_OFF_1:v[0-9]+]], vcc, 56, [[BASE_0_1]]
+; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
 
-; GCN: v_mov_b32_e32 [[V_BASE_1_OFF_1:v[0-9]+]], [[BASE_1_OFF_1]]
-; GCN: buffer_store_dword [[V_BASE_1_OFF_1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN: buffer_store_dword [[BASE_1_OFF_1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
   %tmp0 = alloca [4096 x i32]
   %tmp1 = alloca [4096 x i32]
@@ -163,4 +176,27 @@ define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
   ret void
 }
 
+@g1 = external addrspace(1) global i32*
+
+; This was leaving a dead node around resulting in failing to select
+; on the leftover AssertZext's ValueType operand.
+
+; GCN-LABEL: {{^}}cannot_select_assertzext_valuetype:
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, g1@GOTPCREL+4
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[FI]]
+define void @cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 {
+entry:
+  %b = alloca i32, align 4
+  %tmp1 = load volatile i32*, i32* addrspace(1)* @g1, align 4
+  %arrayidx = getelementptr inbounds i32, i32* %tmp1, i32 %idx
+  %tmp2 = load i32, i32* %arrayidx, align 4
+  store volatile i32* %b, i32* addrspace(1)* undef
+  ret void
+}
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
 attributes #0 = { nounwind }
+attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-slot-bug.ll b/llvm/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
index eb554e21730..105d20511b8 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
@@ -7,8 +7,10 @@
 ;
 ; CHECK-LABEL: {{^}}main:
 ; CHECK: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
-; CHECK: v_add_i32_e32 [[HI_OFF:v[0-9]+]], vcc, 0x200, [[BYTES]]
-; CHECK: v_add_i32_e32 [[LO_OFF:v[0-9]+]], vcc, 0, [[BYTES]]
+; CHECK-DAG: v_mov_b32_e32 [[ZERO_BASE_FI:v[0-9]+]], 0{{$}}
+; CHECK-DAG: v_add_i32_e32 [[HI_OFF:v[0-9]+]], vcc, 0x200, [[BYTES]]
+; CHECK-DAG: v_add_i32_e32 [[LO_OFF:v[0-9]+]], vcc, 0, [[BYTES]]
+
 ; CHECK: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
 ; CHECK: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
 define amdgpu_ps float @main(i32 %idx) {
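
To see the new selection path in isolation, a minimal reproducer in the style of the tests added above behaves the same way. This is an illustrative sketch, not part of the commit: the function name and CHECK lines below are hypothetical, modeled on store_fi_lifetime. An escaping alloca forces the frame index value itself to be materialized, and with the new (i32 frameindex:$fi) pattern it selects directly to a v_mov_b32_e32 of the target frame index, with no AssertZext wrapper and no PreprocessISelDAG store rewriting:

; RUN: llc -march=amdgcn -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; The escaping alloca is the only stack object, so it sits at offset 0 of the
; scratch buffer and the materialized frame index is a literal 0 in a VGPR.
; GCN-LABEL: {{^}}store_captured_fi:
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword [[FI]]
define void @store_captured_fi() #0 {
  %b = alloca i32
  store volatile i32* %b, i32* addrspace(1)* undef
  ret void
}

attributes #0 = { nounwind }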

