author     Matt Arsenault <Matthew.Arsenault@amd.com>   2016-07-09 01:13:56 +0000
committer  Matt Arsenault <Matthew.Arsenault@amd.com>   2016-07-09 01:13:56 +0000
commit     1322b6f8bbf39db37ffc717c9fd208f2726ede38 (patch)
tree       b564bc4adb22fe878c433a04cfac89b1e2b7b284
parent     95c7897555f4f99f5ce71f84ab11b63c256293f2 (diff)
AMDGPU: Improve offset folding for register indexing
llvm-svn: 274954
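
The effect, as far as one can read it out of the patch: when a dynamic vector index carries a constant term (an `add`, or an `or` that behaves like one), the new MOVRELSOffset/MOVRELDOffset complex patterns split it into a register base plus an immediate, and the immediate is carried as the $offset operand of SI_INDIRECT_SRC/DST, where SILowerControlFlow folds it into the subregister selection instead of emitting an extra s_add_i32/s_or_b32 that feeds m0. A minimal IR sketch of the shape this targets, modeled on the tests added below (the function name is illustrative, not part of the patch):

; Sketch only -- modeled on the new indirect-addressing-si.ll tests in this patch.
define void @extract_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) {
entry:
  %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
  %idx.shl = shl i32 %idx.in, 2      ; dynamic part of the index
  %idx = or i32 %idx.shl, 1          ; constant part, expected to become the $offset immediate
  %value = extractelement <4 x i32> %ld, i32 %idx
  store i32 %value, i32 addrspace(1)* %out
  ret void
}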
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp      | 49
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp             |  3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td              |  3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td           | 30
-rw-r--r--  llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp      | 62
-rw-r--r--  llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll | 33
-rw-r--r--  llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll      |  4
7 files changed, 137 insertions(+), 47 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 157b06b7f46..fcaa6f907e5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -129,6 +129,10 @@ private:
   bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
   bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
   bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const;
+  bool selectMOVRELOffsetImpl(SDValue Index, SDValue &Base,
+                              SDValue &Offset, bool IsInsert) const;
+  bool selectMOVRELSOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
+  bool selectMOVRELDOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
   bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
   bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
   bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
@@ -1189,6 +1193,51 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr,
          !isa<ConstantSDNode>(Offset);
 }
 
+bool AMDGPUDAGToDAGISel::selectMOVRELOffsetImpl(SDValue Index,
+                                                SDValue &Base,
+                                                SDValue &Offset,
+                                                bool IsInsert) const {
+  SDLoc DL(Index);
+
+  if (CurDAG->isBaseWithConstantOffset(Index)) {
+    SDValue N0 = Index.getOperand(0);
+    SDValue N1 = Index.getOperand(1);
+    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+
+    // (add n0, c0)
+    Base = N0;
+    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
+    return true;
+  }
+
+  if (IsInsert) {
+    if (ConstantSDNode *CBase = dyn_cast<ConstantSDNode>(Index)) {
+      Base = CurDAG->getRegister(AMDGPU::NoRegister, MVT::i32);
+      Offset = CurDAG->getTargetConstant(CBase->getZExtValue(), DL, MVT::i32);
+      return true;
+    }
+  } else {
+    if (isa<ConstantSDNode>(Index))
+      return false;
+  }
+
+  Base = Index;
+  Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::selectMOVRELSOffset(SDValue Index,
+                                             SDValue &Base,
+                                             SDValue &Offset) const {
+  return selectMOVRELOffsetImpl(Index, Base, Offset, false);
+}
+
+bool AMDGPUDAGToDAGISel::selectMOVRELDOffset(SDValue Index,
+                                             SDValue &Base,
+                                             SDValue &Offset) const {
+  return selectMOVRELOffsetImpl(Index, Base, Offset, true);
+}
+
 SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL,
                                      SDValue Val, uint32_t Offset,
                                      uint32_t Width) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 213cdc310c8..798ff08bef1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1712,7 +1712,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
 
     if (RegClass != -1) {
       unsigned Reg = MI.getOperand(i).getReg();
-      if (TargetRegisterInfo::isVirtualRegister(Reg))
+      if (Reg == AMDGPU::NoRegister ||
+          TargetRegisterInfo::isVirtualRegister(Reg))
         continue;
 
       const TargetRegisterClass *RC = RI.getRegClass(RegClass);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 9ae5851190d..88d133f7b68 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -624,6 +624,9 @@ def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">;
 def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">;
 def SMRDBufferSgpr : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">;
 
+def MOVRELSOffset : ComplexPattern<i32, 2, "selectMOVRELSOffset">;
+def MOVRELDOffset : ComplexPattern<i32, 2, "selectMOVRELDOffset">;
+
 def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
 def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">;
 def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 78bb1c1ca81..3eebd1bad27 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2021,12 +2021,11 @@ let Uses = [EXEC], Defs = [EXEC, VCC, M0],
 
 class SI_INDIRECT_SRC<RegisterClass rc> : InstSI <
   (outs VGPR_32:$vdst, SReg_64:$sdst),
-  (ins rc:$src, VSrc_32:$idx, i32imm:$offset)
->;
+  (ins rc:$src, VS_32:$idx, i32imm:$offset)>;
 
 class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
   (outs rc:$vdst, SReg_64:$sdst),
-  (ins unknown:$src, VSrc_32:$idx, i32imm:$offset, VGPR_32:$val)> {
+  (ins unknown:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> {
   let Constraints = "$src = $vdst";
 }
 
@@ -3308,29 +3307,16 @@ def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
 /********** ====================== **********/
 
 multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> {
-
-  // 1. Extract with offset
-  def : Pat<
-    (eltvt (extractelt vt:$vec, (add i32:$idx, imm:$off))),
-    (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, imm:$off)
-  >;
-
-  // 2. Extract without offset
-  def : Pat<
-    (eltvt (extractelt vt:$vec, i32:$idx)),
-    (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, 0)
-  >;
-
-  // 3. Insert with offset
+  // Extract with offset
   def : Pat<
-    (insertelt vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)),
-    (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, imm:$off, $val)
+    (eltvt (extractelt vt:$src, (MOVRELSOffset i32:$idx, (i32 imm:$offset)))),
+    (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset)
  >;
 
-  // 4. Insert without offset
+  // Insert with offset
   def : Pat<
-    (insertelt vt:$vec, eltvt:$val, i32:$idx),
-    (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, 0, $val)
+    (insertelt vt:$src, eltvt:$val, (MOVRELDOffset i32:$idx, (i32 imm:$offset))),
+    (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val)
   >;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 930fcb9c837..1ba14cd2d1c 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -435,7 +435,7 @@ void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
     .addReg(AMDGPU::VCC);
 
-  if (Offset) {
+  if (Offset != 0) {
     BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
       .addReg(AMDGPU::M0)
       .addImm(Offset);
@@ -463,7 +463,7 @@ bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offs
   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
 
   if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) {
-    if (Offset) {
+    if (Offset != 0) {
       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
         .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()))
         .addImm(Offset);
@@ -520,16 +520,17 @@ bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offs
   return true;
 }
 
-/// \param @VecReg The register which holds element zero of the vector
-///                 being addressed into.
-/// \param[out] @Reg The base register to use in the indirect addressing instruction.
-/// \param[in,out] @Offset As an input, this is the constant offset part of the
-//                         indirect Index. e.g. v0 = v[VecReg + Offset]
-//                         As an output, this is a constant value that needs
-//                         to be added to the value stored in M0.
+/// \param @VecReg The register which holds element zero of the vector being
+///                addressed into.
+//
+/// \param[in] @Idx The index operand from the movrel instruction. This must be
+//                  a register, but may be NoRegister.
+///
+/// \param[in] @Offset As an input, this is the constant offset part of the
+//                     indirect Index. e.g. v0 = v[VecReg + Offset] As an output,
+//                     this is a constant value that needs to be added to the
+//                     value stored in M0.
 std::pair<unsigned, int>
-SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg,
-                                                int Offset) const {
+SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg, int Offset) const {
   unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
   if (!SubReg)
     SubReg = VecReg;
@@ -560,42 +561,59 @@ SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg,
 // Return true if a new block was inserted.
 bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
+  const DebugLoc &DL = MI.getDebugLoc();
 
   unsigned Dst = MI.getOperand(0).getReg();
   const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
-  int Off = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
+  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
   unsigned Reg;
 
-  std::tie(Reg, Off) = computeIndirectRegAndOffset(SrcVec->getReg(), Off);
+  std::tie(Reg, Offset) = computeIndirectRegAndOffset(SrcVec->getReg(), Offset);
+
+  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+  if (Idx->getReg() == AMDGPU::NoRegister) {
+    // Only had a constant offset, copy the register directly.
+    BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
+      .addReg(Reg, getUndefRegState(SrcVec->isUndef()));
+    MI.eraseFromParent();
+    return false;
+  }
 
   MachineInstr *MovRel =
     BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
     .addReg(Reg, getUndefRegState(SrcVec->isUndef()))
     .addReg(SrcVec->getReg(), RegState::Implicit);
 
-  return loadM0(MI, MovRel, Off);
+  return loadM0(MI, MovRel, Offset);
 }
 
 // Return true if a new block was inserted.
 bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
+  const DebugLoc &DL = MI.getDebugLoc();
 
   unsigned Dst = MI.getOperand(0).getReg();
-  int Off = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
-  MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
+  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
   unsigned Reg;
 
-  std::tie(Reg, Off) = computeIndirectRegAndOffset(Dst, Off);
+  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
+  std::tie(Reg, Offset) = computeIndirectRegAndOffset(Dst, Offset);
+
+  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+  if (Idx->getReg() == AMDGPU::NoRegister) {
+    // Only had a constant offset, copy the register directly.
+    BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Reg)
+      .addOperand(*Val);
+    MI.eraseFromParent();
+    return false;
+  }
 
   MachineInstr *MovRel =
-    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
-    .addReg(Reg, RegState::Define)
+    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32), Reg)
     .addReg(Val->getReg(), getUndefRegState(Val->isUndef()))
     .addReg(Dst, RegState::Implicit);
 
-  return loadM0(MI, MovRel, Off);
+  return loadM0(MI, MovRel, Offset);
 }
 
 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index 9f57dd2f321..66cec88e760 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -452,6 +452,39 @@ entry:
   ret void
 }
 
+; Test that the or is folded into the base address register instead of
+; added to m0
+
+; GCN-LABEL: {{^}}extractelement_v4i32_or_index:
+; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
+; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
+; GCN-NOT: [[IDX_SHL]]
+; GCN: s_mov_b32 m0, [[IDX_SHL]]
+; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+define void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) {
+entry:
+  %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
+  %idx.shl = shl i32 %idx.in, 2
+  %idx = or i32 %idx.shl, 1
+  %value = extractelement <4 x i32> %ld, i32 %idx
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}insertelement_v4f32_or_index:
+; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
+; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
+; GCN-NOT: [[IDX_SHL]]
+; GCN: s_mov_b32 m0, [[IDX_SHL]]
+; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+define void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind {
+  %idx.shl = shl i32 %idx.in, 2
+  %idx = or i32 %idx.shl, 1
+  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %idx
+  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 93001e4c139..3e6905f887f 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -339,9 +339,9 @@ endif:
 
 ; FIXME: Should be able to manipulate m0 directly instead of add and
 ; copy.
-; GCN: s_or_b32 [[IDX1:s[0-9]+]], [[SCALEDIDX]], 1
+; FIXME: Should avoid resetting m0 to same value
 ; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000
-; GCN-DAG: s_mov_b32 m0, [[IDX1]]
+; GCN-DAG: s_mov_b32 m0, [[SCALEDIDX]]
 ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]
 ; GCN: buffer_store_dwordx4
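
A note on the fully constant case, hedged since it depends on what earlier lowering leaves behind: for inserts, selectMOVRELDOffset also accepts a plain constant index, producing an AMDGPU::NoRegister base with the whole index carried as the immediate, and the new early-outs in indirectSrc/indirectDst then emit a single v_mov_b32 for the statically chosen subregister with no m0 write and no movrel. IR of the shape that could exercise that path, assuming it survives to the SI_INDIRECT_DST pattern (the function name is illustrative, not part of the patch):

; Sketch only -- a constant insert index; if it reaches SI_INDIRECT_DST, the
; NoRegister path added here copies %v into the selected subregister with a
; plain v_mov_b32 instead of setting up m0 for a v_movreld.
define void @insert_const_index(<4 x float> addrspace(1)* %out, <4 x float> %a, float %v) {
  %vecins = insertelement <4 x float> %a, float %v, i32 2
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}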