 llvm/lib/Target/AMDGPU/SIISelLowering.cpp       | 107
 llvm/lib/Target/AMDGPU/SIISelLowering.h         |   4
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp          | 185
 llvm/lib/Target/AMDGPU/SIInstrInfo.h            |   2
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp |   7
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h   |   5
 llvm/test/CodeGen/AMDGPU/smrd-fold-offset.mir   |   8
 llvm/test/CodeGen/AMDGPU/smrd.ll                |  50
 8 files changed, 242 insertions, 126 deletions
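
In short, the patch drops the DAG-level lowerSBuffer() path and instead handles divergent s_buffer_load offsets in SIInstrInfo::moveToVALU, rewriting S_BUFFER_LOAD_*_SGPR into BUFFER_LOAD_*_OFFEN and folding a legal immediate out of a preceding V_ADD into the MUBUF offset field; splitMUBUFOffset correspondingly loses its Align parameter and always assumes 4-byte alignment. The sketch below is an illustration only, not code from this patch: it shows the folding condition the new moveToVALU path and the updated smrd tests rely on, namely that the add immediate fits the 12-bit unsigned MUBUF offset field (what isLegalMUBUFImmOffset checks); the helper name is hypothetical.

    // Illustration only: an add-immediate can be folded into the MUBUF "offset"
    // field when it fits the 12-bit unsigned immediate, so 4095 folds while
    // 4096 must stay in the VGPR address (matching the updated smrd.ll checks).
    #include <cstdint>

    static bool fitsMUBUFImmOffset(int64_t Imm) {
      return Imm >= 0 && Imm < (1 << 12); // 0..4095, i.e. isUInt<12>(Imm)
    }
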
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 672784a9873..254f1362f1f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4847,70 +4847,6 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
   return SDValue(NewNode, 0);
 }
 
-SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
-                                       SDValue Offset, SDValue GLC,
-                                       SelectionDAG &DAG) const {
-  MachineFunction &MF = DAG.getMachineFunction();
-  MachineMemOperand *MMO = MF.getMachineMemOperand(
-      MachinePointerInfo(),
-      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
-          MachineMemOperand::MOInvariant,
-      VT.getStoreSize(), VT.getStoreSize());
-
-  if (!Offset->isDivergent()) {
-    SDValue Ops[] = {
-        Rsrc,
-        Offset, // Offset
-        GLC     // glc
-    };
-    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
-                                   DAG.getVTList(VT), Ops, VT, MMO);
-  }
-
-  // We have a divergent offset. Emit a MUBUF buffer load instead. We can
-  // assume that the buffer is unswizzled.
-  SmallVector<SDValue, 4> Loads;
-  unsigned NumLoads = 1;
-  MVT LoadVT = VT.getSimpleVT();
-
-  assert(LoadVT == MVT::i32 || LoadVT == MVT::v2i32 || LoadVT == MVT::v4i32 ||
-         LoadVT == MVT::v8i32 || LoadVT == MVT::v16i32);
-
-  if (VT == MVT::v8i32 || VT == MVT::v16i32) {
-    NumLoads = VT == MVT::v16i32 ? 4 : 2;
-    LoadVT = MVT::v4i32;
-  }
-
-  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
-  unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
-  SDValue Ops[] = {
-      DAG.getEntryNode(),                         // Chain
-      Rsrc,                                       // rsrc
-      DAG.getConstant(0, DL, MVT::i32),           // vindex
-      {},                                         // voffset
-      {},                                         // soffset
-      {},                                         // offset
-      DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
-      DAG.getConstant(0, DL, MVT::i1),            // idxen
-  };
-
-  // Use the alignment to ensure that the required offsets will fit into the
-  // immediate offsets.
-  setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
-
-  uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
-  for (unsigned i = 0; i < NumLoads; ++i) {
-    Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
-    Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
-                                            Ops, LoadVT, MMO));
-  }
-
-  if (VT == MVT::v8i32 || VT == MVT::v16i32)
-    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
-
-  return Loads[0];
-}
-
 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                   SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
@@ -5065,15 +5001,38 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                           SDLoc(DAG.getEntryNode()),
                           MFI->getArgInfo().WorkItemIDZ);
   case AMDGPUIntrinsic::SI_load_const: {
-    SDValue Load =
-        lowerSBuffer(MVT::i32, DL, Op.getOperand(1), Op.getOperand(2),
-                     DAG.getTargetConstant(0, DL, MVT::i1), DAG);
+    SDValue Ops[] = {
+      Op.getOperand(1), // Ptr
+      Op.getOperand(2), // Offset
+      DAG.getTargetConstant(0, DL, MVT::i1) // glc
+    };
+
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+        MachinePointerInfo(),
+        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+            MachineMemOperand::MOInvariant,
+        VT.getStoreSize(), 4);
+    SDVTList VTList = DAG.getVTList(MVT::i32);
+    SDValue Load = DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
+                                           VTList, Ops, MVT::i32, MMO);
+
     return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load);
   }
   case Intrinsic::amdgcn_s_buffer_load: {
     unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
-    return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
-                        DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG);
+    SDValue Ops[] = {
+      Op.getOperand(1), // Ptr
+      Op.getOperand(2), // Offset
+      DAG.getTargetConstant(Cache & 1, DL, MVT::i1) // glc
+    };
+
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+        MachinePointerInfo(),
+        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+            MachineMemOperand::MOInvariant,
+        VT.getStoreSize(), VT.getStoreSize());
+    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
+                                   Op->getVTList(), Ops, VT, MMO);
   }
   case Intrinsic::amdgcn_fdiv_fast:
     return lowerFDIV_FAST(Op, DAG);
@@ -6108,13 +6067,13 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
 // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
 // pointed to by Offsets.
 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
-                                        SelectionDAG &DAG, SDValue *Offsets,
-                                        unsigned Align) const {
+                                        SelectionDAG &DAG,
+                                        SDValue *Offsets) const {
   SDLoc DL(CombinedOffset);
   if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
     uint32_t Imm = C->getZExtValue();
     uint32_t SOffset, ImmOffset;
-    if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
+    if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget)) {
       Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
       Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
@@ -6126,8 +6085,8 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
     SDValue N1 = CombinedOffset.getOperand(1);
     uint32_t SOffset, ImmOffset;
     int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
-    if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
-                                                Subtarget, Align)) {
+    if (Offset >= 0
+        && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, Subtarget)) {
       Offsets[0] = N0;
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
       Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index d12c3ae4dba..73fa05ea58f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -60,8 +60,6 @@ private:
                               MVT VT, unsigned Offset) const;
   SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
                      SelectionDAG &DAG) const;
-  SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset,
-                       SDValue GLC, SelectionDAG &DAG) const;
 
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
@@ -192,7 +190,7 @@ private:
   // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
   // pointed to by Offsets.
   void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
-                        SDValue *Offsets, unsigned Align = 4) const;
+                        SDValue *Offsets) const;
 
 public:
   SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 4dd06df1233..562428ef37c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3558,13 +3558,8 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
   // pointer value is uniform.
   MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
   if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
-    unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
-    SBase->setReg(SGPR);
-  }
-  MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
-  if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
-    unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
-    SOff->setReg(SGPR);
+    unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
+    SBase->setReg(SGPR);
   }
 }
 
@@ -4193,6 +4188,115 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
       Inst.eraseFromParent();
       continue;
+
+    case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
+    case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+    case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+    case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+    case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR: {
+      unsigned VDst;
+      unsigned NewOpcode;
+
+      switch(Opcode) {
+      case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
+        NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
+        VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        break;
+      case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+        NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
+        VDst = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+        break;
+      case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+        NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
+        VDst = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
+        break;
+      case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+      case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
+        splitScalarBuffer(Worklist, Inst);
+        Inst.eraseFromParent();
+        continue;
+      }
+
+      const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff);
+      auto Add = MRI.getUniqueVRegDef(VAddr->getReg());
+      unsigned Offset = 0;
+
+      // FIXME: This isn't safe because the addressing mode doesn't work
+      // correctly if vaddr is negative.
+      //
+      // FIXME: Should probably be done somewhere else, maybe SIFoldOperands.
+      //
+      // See if we can extract an immediate offset by recognizing one of these:
+      //   V_ADD_I32_e32 dst, imm, src1
+      //   V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1
+      // V_ADD will be removed by "Remove dead machine instructions".
+      if (Add &&
+          (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 ||
+           Add->getOpcode() == AMDGPU::V_ADD_U32_e32 ||
+           Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) {
+        static const unsigned SrcNames[2] = {
+          AMDGPU::OpName::src0,
+          AMDGPU::OpName::src1,
+        };
+
+        // Find a literal offset in one of source operands.
+        for (int i = 0; i < 2; i++) {
+          const MachineOperand *Src =
+              getNamedOperand(*Add, SrcNames[i]);
+
+          if (Src->isReg()) {
+            MachineInstr *Def = MRI.getUniqueVRegDef(Src->getReg());
+            if (Def) {
+              if (Def->isMoveImmediate())
+                Src = &Def->getOperand(1);
+              else if (Def->isCopy()) {
+                auto Mov = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
+                if (Mov && Mov->isMoveImmediate()) {
+                  Src = &Mov->getOperand(1);
+                }
+              }
+            }
+          }
+
+          if (Src) {
+            if (Src->isImm())
+              Offset = Src->getImm();
+            else if (Src->isCImm())
+              Offset = Src->getCImm()->getZExtValue();
+          }
+
+          if (Offset && isLegalMUBUFImmOffset(Offset)) {
+            VAddr = getNamedOperand(*Add, SrcNames[!i]);
+            break;
+          }
+
+          Offset = 0;
+        }
+      }
+
+      MachineInstr *NewInstr =
+          BuildMI(*MBB, Inst, Inst.getDebugLoc(),
+                  get(NewOpcode), VDst)
+          .add(*VAddr) // vaddr
+          .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc
+          .addImm(0) // soffset
+          .addImm(Offset) // offset
+          .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm())
+          .addImm(0) // slc
+          .addImm(0) // tfe
+          .cloneMemRefs(Inst)
+          .getInstr();
+
+      MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(),
+                         VDst);
+      addUsersToMoveToVALUWorklist(VDst, MRI, Worklist);
+      Inst.eraseFromParent();
+
+      // Legalize all operands other than the offset. Notably, convert the srsrc
+      // into SGPRs using v_readfirstlane if needed.
+      legalizeOperands(*NewInstr, MDT);
+      continue;
+    }
     }
 
     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
@@ -4674,6 +4778,73 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
 }
 
+void SIInstrInfo::splitScalarBuffer(SetVectorType &Worklist,
+                                    MachineInstr &Inst) const {
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  MachineBasicBlock::iterator MII = Inst;
+  auto &DL = Inst.getDebugLoc();
+
+  MachineOperand &Dest = *getNamedOperand(Inst, AMDGPU::OpName::sdst);;
+  MachineOperand &Rsrc = *getNamedOperand(Inst, AMDGPU::OpName::sbase);
+  MachineOperand &Offset = *getNamedOperand(Inst, AMDGPU::OpName::soff);
+  MachineOperand &Glc = *getNamedOperand(Inst, AMDGPU::OpName::glc);
+
+  unsigned Opcode = Inst.getOpcode();
+  unsigned NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
+  unsigned Count = 0;
+  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
+  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
+
+  switch(Opcode) {
+  default:
+    return;
+  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+    Count = 2;
+    break;
+  case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
+    Count = 4;
+    break;
+  }
+
+  // FIXME: Should also attempt to build VAddr and Offset like the non-split
+  // case (see call site for this function)
+
+  // Create a vector of result registers
+  SmallVector<unsigned, 8> ResultRegs;
+  for (unsigned i = 0; i < Count ; ++i) {
+    unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_128RegClass);
+    MachineInstr &NewMI = *BuildMI(MBB, MII, DL, get(NewOpcode), ResultReg)
+      .addReg(Offset.getReg()) // offset
+      .addReg(Rsrc.getReg())   // rsrc
+      .addImm(0)               // soffset
+      .addImm(i << 4)          // inst_offset
+      .addImm(Glc.getImm())    // glc
+      .addImm(0)               // slc
+      .addImm(0)               // tfe
+      .addMemOperand(*Inst.memoperands_begin());
+    // Extract the 4 32 bit sub-registers from the result to add into the final REG_SEQUENCE
+    auto &NewDestOp = NewMI.getOperand(0);
+    for (unsigned i = 0 ; i < 4 ; i++)
+      ResultRegs.push_back(buildExtractSubReg(MII, MRI, NewDestOp, &AMDGPU::VReg_128RegClass,
+                                              RI.getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass));
+  }
+  // Create a new combined result to replace original with
+  unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
+  MachineInstrBuilder CombinedResBuilder = BuildMI(MBB, MII, DL,
+                                  get(TargetOpcode::REG_SEQUENCE), FullDestReg);
+
+  for (unsigned i = 0 ; i < Count * 4 ; ++i) {
+    CombinedResBuilder
+      .addReg(ResultRegs[i])
+      .addImm(RI.getSubRegFromChannel(i));
+  }
+
+  MRI.replaceRegWith(Dest.getReg(), FullDestReg);
+  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
+}
+
 void SIInstrInfo::addUsersToMoveToVALUWorklist(
   unsigned DstReg,
   MachineRegisterInfo &MRI,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 34cac88cbf1..2f51b199950 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -103,6 +103,8 @@ private:
                             MachineInstr &Inst) const;
   void splitScalar64BitBFE(SetVectorType &Worklist,
                            MachineInstr &Inst) const;
+  void splitScalarBuffer(SetVectorType &Worklist,
+                         MachineInstr &Inst) const;
   void movePackToVALU(SetVectorType &Worklist,
                       MachineRegisterInfo &MRI, MachineInstr &Inst) const;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 9d567579d71..634ec8fcc3d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -894,12 +894,9 @@ bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
 // Given Imm, split it into the values to put into the SOffset and ImmOffset
 // fields in an MUBUF instruction. Return false if it is not possible (due to a
 // hardware bug needing a workaround).
-//
-// The required alignment ensures that individual address components remain
-// aligned if they are aligned to begin with. It also ensures that additional
-// offsets within the given alignment can be added to the resulting ImmOffset.
 bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
-                      const GCNSubtarget *Subtarget, uint32_t Align) {
+                      const GCNSubtarget *Subtarget) {
+  const uint32_t Align = 4;
   const uint32_t MaxImm = alignDown(4095, Align);
   uint32_t Overflow = 0;
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index af5ab9bf269..d45f4249869 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -441,8 +441,11 @@ int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
 /// not the encoded offset.
 bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
 
+// Given Imm, split it into the values to put into the SOffset and ImmOffset
+// fields in an MUBUF instruction. Return false if it is not possible (due to a
+// hardware bug needing a workaround).
 bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
-                      const GCNSubtarget *Subtarget, uint32_t Align = 4);
+                      const GCNSubtarget *Subtarget);
 
 /// \returns true if the intrinsic is divergent
 bool isIntrinsicSourceOfDivergence(unsigned IntrID);
diff --git a/llvm/test/CodeGen/AMDGPU/smrd-fold-offset.mir b/llvm/test/CodeGen/AMDGPU/smrd-fold-offset.mir
index 10601ccaeb7..44954f06523 100644
--- a/llvm/test/CodeGen/AMDGPU/smrd-fold-offset.mir
+++ b/llvm/test/CodeGen/AMDGPU/smrd-fold-offset.mir
@@ -1,8 +1,6 @@
 # RUN: llc -march=amdgcn -run-pass si-fix-sgpr-copies -o - %s | FileCheck -check-prefix=GCN %s
 
-# GCN-LABEL: name: smrd_vgpr_offset_imm
-# GCN: V_READFIRSTLANE_B32
-# GCN: S_BUFFER_LOAD_DWORD_SGPR
+# GCN: BUFFER_LOAD_DWORD_OFFEN %{{[0-9]+}}, killed %{{[0-9]+}}, 0, 4095
 ---
 name: smrd_vgpr_offset_imm
 body: |
@@ -24,9 +22,7 @@ body: |
     SI_RETURN_TO_EPILOG $vgpr0
 ...
 
-# GCN-LABEL: name: smrd_vgpr_offset_imm_add_u32
-# GCN: V_READFIRSTLANE_B32
-# GCN: S_BUFFER_LOAD_DWORD_SGPR
+# GCN: BUFFER_LOAD_DWORD_OFFEN %{{[0-9]+}}, killed %{{[0-9]+}}, 0, 4095
 ---
 name: smrd_vgpr_offset_imm_add_u32
 body: |
diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll
index f453cfdbd1f..c87145a1a5b 100644
--- a/llvm/test/CodeGen/AMDGPU/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/smrd.ll
@@ -292,19 +292,18 @@ main_body:
 
 ; GCN-LABEL: {{^}}smrd_vgpr_offset_imm:
 ; GCN-NEXT: %bb.
-; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4092 ;
+; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen offset:4095 ;
 define amdgpu_ps float @smrd_vgpr_offset_imm(<4 x i32> inreg %desc, i32 %offset) #0 {
 main_body:
-  %off = add i32 %offset, 4092
+  %off = add i32 %offset, 4095
   %r = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %off)
   ret float %r
 }
 
 ; GCN-LABEL: {{^}}smrd_vgpr_offset_imm_too_large:
 ; GCN-NEXT: %bb.
-; SICI-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0
-; SICI-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
-; VIGFX9-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 4 offen offset:4092 ;
+; GCN-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}0x1000, v0
+; GCN-NEXT: buffer_load_dword v{{[0-9]}}, v0, s[0:3], 0 offen ;
 define amdgpu_ps float @smrd_vgpr_offset_imm_too_large(<4 x i32> inreg %desc, i32 %offset) #0 {
 main_body:
   %off = add i32 %offset, 4096
@@ -511,15 +510,12 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}smrd_load_nonconst4:
-; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0xff8, v0 ;
-; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
-; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
-; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
-; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
-; VIGFX9-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 56 offen offset:4032 ;
-; VIGFX9-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 56 offen offset:4048 ;
-; VIGFX9-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 56 offen offset:4064 ;
-; VIGFX9-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 56 offen offset:4080 ;
+; SICIVI: v_add_{{i32|u32}}_e32 v{{[0-9]+}}, vcc, 0xff8, v0 ;
+; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0xff8, v0 ;
+; GCN-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
+; GCN-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
+; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
+; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
 ; GCN: ; return to shader part epilog
 define amdgpu_ps <16 x float> @smrd_load_nonconst4(<4 x i32> inreg %rsrc, i32 %off) #0 {
 main_body:
@@ -530,16 +526,12 @@ main_body:
 }
 
 ; GCN-LABEL: {{^}}smrd_load_nonconst5:
-; SICI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x1004, v0
-; SICI-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
-; SICI-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
-; SICI-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
-; SICI-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
-; VIGFX9: s_movk_i32 s4, 0xfc0
-; VIGFX9-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], s4 offen offset:68 ;
-; VIGFX9-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], s4 offen offset:84 ;
-; VIGFX9-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], s4 offen offset:100 ;
-; VIGFX9-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], s4 offen offset:116 ;
+; SICIVI: v_add_{{i32|u32}}_e32 v{{[0-9]+}}, vcc, 0x1004, v0
+; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0x1004, v0
+; GCN-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
+; GCN-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
+; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
+; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
 ; GCN: ; return to shader part epilog
 define amdgpu_ps <16 x float> @smrd_load_nonconst5(<4 x i32> inreg %rsrc, i32 %off) #0 {
 main_body:
@@ -567,10 +559,9 @@ main_body:
 
 ; GCN-LABEL: {{^}}smrd_uniform_loop:
 ;
-; TODO: we should keep the loop counter in an SGPR
+; TODO: this should use an s_buffer_load
 ;
-; GCN: v_readfirstlane_b32
-; GCN: s_buffer_load_dword
+; GCN: buffer_load_dword
 define amdgpu_ps float @smrd_uniform_loop(<4 x i32> inreg %desc, i32 %bound) #0 {
 main_body:
   br label %loop
@@ -594,10 +585,9 @@ exit:
 ; (this test differs from smrd_uniform_loop by the more complex structure of phis,
 ; which used to confuse the DivergenceAnalysis after structurization)
 ;
-; TODO: we should keep the loop counter in an SGPR
+; TODO: we should keep the loop counter in an SGPR and use an S_BUFFER_LOAD
 ;
-; GCN: v_readfirstlane_b32
-; GCN: s_buffer_load_dword
+; GCN: buffer_load_dword
 define amdgpu_ps float @smrd_uniform_loop2(<4 x i32> inreg %desc, i32 %bound, i32 %bound.a) #0 {
 main_body:
   br label %loop