Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp  107
1 file changed, 74 insertions, 33 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index dbcd1bf0c76..fd4b1f361cd 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4848,6 +4848,70 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
   return SDValue(NewNode, 0);
 }
 
+SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
+                                       SDValue Offset, SDValue GLC,
+                                       SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo(),
+      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+          MachineMemOperand::MOInvariant,
+      VT.getStoreSize(), VT.getStoreSize());
+
+  if (!Offset->isDivergent()) {
+    SDValue Ops[] = {
+        Rsrc,
+        Offset, // Offset
+        GLC     // glc
+    };
+    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
+                                   DAG.getVTList(VT), Ops, VT, MMO);
+  }
+
+  // We have a divergent offset. Emit a MUBUF buffer load instead. We can
+  // assume that the buffer is unswizzled.
+  SmallVector<SDValue, 4> Loads;
+  unsigned NumLoads = 1;
+  MVT LoadVT = VT.getSimpleVT();
+
+  assert(LoadVT == MVT::i32 || LoadVT == MVT::v2i32 || LoadVT == MVT::v4i32 ||
+         LoadVT == MVT::v8i32 || LoadVT == MVT::v16i32);
+
+  if (VT == MVT::v8i32 || VT == MVT::v16i32) {
+    NumLoads = VT == MVT::v16i32 ? 4 : 2;
+    LoadVT = MVT::v4i32;
+  }
+
+  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
+  unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
+  SDValue Ops[] = {
+      DAG.getEntryNode(),                         // Chain
+      Rsrc,                                       // rsrc
+      DAG.getConstant(0, DL, MVT::i32),           // vindex
+      {},                                         // voffset
+      {},                                         // soffset
+      {},                                         // offset
+      DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
+      DAG.getConstant(0, DL, MVT::i1),            // idxen
+  };
+
+  // Use the alignment to ensure that the required offsets will fit into the
+  // immediate offsets.
+  setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
+
+  uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
+  for (unsigned i = 0; i < NumLoads; ++i) {
+    Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
+    Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
+                                            Ops, LoadVT, MMO));
+  }
+
+  if (VT == MVT::v8i32 || VT == MVT::v16i32)
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
+
+  return Loads[0];
+}
+
 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                   SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
@@ -5002,38 +5066,15 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                           SDLoc(DAG.getEntryNode()),
                           MFI->getArgInfo().WorkItemIDZ);
   case AMDGPUIntrinsic::SI_load_const: {
-    SDValue Ops[] = {
-      Op.getOperand(1),                     // Ptr
-      Op.getOperand(2),                     // Offset
-      DAG.getTargetConstant(0, DL, MVT::i1) // glc
-    };
-
-    MachineMemOperand *MMO = MF.getMachineMemOperand(
-        MachinePointerInfo(),
-        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
-            MachineMemOperand::MOInvariant,
-        VT.getStoreSize(), 4);
-    SDVTList VTList = DAG.getVTList(MVT::i32);
-    SDValue Load = DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
-                                           VTList, Ops, MVT::i32, MMO);
-
+    SDValue Load =
+        lowerSBuffer(MVT::i32, DL, Op.getOperand(1), Op.getOperand(2),
+                     DAG.getTargetConstant(0, DL, MVT::i1), DAG);
     return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load);
   }
   case Intrinsic::amdgcn_s_buffer_load: {
     unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
-    SDValue Ops[] = {
-      Op.getOperand(1),                             // Ptr
-      Op.getOperand(2),                             // Offset
-      DAG.getTargetConstant(Cache & 1, DL, MVT::i1) // glc
-    };
-
-    MachineMemOperand *MMO = MF.getMachineMemOperand(
-        MachinePointerInfo(),
-        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
-            MachineMemOperand::MOInvariant,
-        VT.getStoreSize(), VT.getStoreSize());
-    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
-                                   Op->getVTList(), Ops, VT, MMO);
+    return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
+                        DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG);
   }
   case Intrinsic::amdgcn_fdiv_fast:
     return lowerFDIV_FAST(Op, DAG);
@@ -6068,13 +6109,13 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
 // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
 // pointed to by Offsets.
 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
-                                        SelectionDAG &DAG,
-                                        SDValue *Offsets) const {
+                                        SelectionDAG &DAG, SDValue *Offsets,
+                                        unsigned Align) const {
   SDLoc DL(CombinedOffset);
   if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
     uint32_t Imm = C->getZExtValue();
     uint32_t SOffset, ImmOffset;
-    if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget)) {
+    if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
       Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
       Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
@@ -6086,8 +6127,8 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
     SDValue N1 = CombinedOffset.getOperand(1);
     uint32_t SOffset, ImmOffset;
     int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
-    if (Offset >= 0
-        && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, Subtarget)) {
+    if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
+                                                Subtarget, Align)) {
       Offsets[0] = N0;
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
       Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
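The interesting part of the patch is the divergent branch of lowerSBuffer: when the offset is not wave-uniform, the scalar SBUFFER_LOAD is replaced by one or more MUBUF buffer loads, and results wider than 128 bits are chunked into v4i32 pieces. A minimal standalone sketch of that chunking decision, using hypothetical names (WideTy, chunkWideLoad) rather than real SelectionDAG types:

// Hypothetical, self-contained model of lowerSBuffer's chunking decision.
#include <cassert>
#include <utility>

enum class WideTy { I32, V2I32, V4I32, V8I32, V16I32 };

// Results up to 128 bits go out as a single buffer load; v8i32 and v16i32
// are split into v4i32 (16-byte) chunks that the DAG reassembles afterwards
// with CONCAT_VECTORS.
std::pair<unsigned, WideTy> chunkWideLoad(WideTy VT) {
  switch (VT) {
  case WideTy::V8I32:
    return {2, WideTy::V4I32};
  case WideTy::V16I32:
    return {4, WideTy::V4I32};
  default:
    return {1, VT};
  }
}

int main() {
  assert(chunkWideLoad(WideTy::V16I32).first == 4); // four 16-byte loads
  assert(chunkWideLoad(WideTy::I32).first == 1);    // one load, type unchanged
  return 0;
}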
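The new Align parameter to setBufferOffsets is what keeps those per-chunk immediates encodable: splitting the combined offset with Align = 16 * NumLoads leaves enough headroom that InstOffset + 16 * i fits the MUBUF immediate field for every chunk. Below is a toy model of that arithmetic, assuming the usual 12-bit (0..4095) unsigned offset field and a power-of-two alignment; toySplitOffset is illustrative and much simpler than the real AMDGPU::splitMUBUFOffset, which is subtarget-dependent:

// Toy model of the Align-aware offset split; not the real LLVM API.
#include <cassert>
#include <cstdint>
#include <cstdio>

constexpr uint32_t MaxMUBUFImm = 4095; // assumed 12-bit unsigned offset field

// Split a combined byte offset into an SGPR part (SOffset) and an
// instruction immediate (ImmOffset). Capping the immediate at
// alignDown(4095, Align) guarantees Align - 16 bytes of headroom, so every
// later InstOffset + 16 * i still encodes. Align must be a power of two.
void toySplitOffset(uint32_t Combined, uint32_t Align, uint32_t &SOffset,
                    uint32_t &ImmOffset) {
  uint32_t MaxImm = MaxMUBUFImm & ~(Align - 1); // alignDown(4095, Align)
  if (Combined <= MaxImm) {
    SOffset = 0;
    ImmOffset = Combined;
  } else {
    ImmOffset = Combined & (Align - 1); // low bits stay in the immediate
    SOffset = Combined - ImmOffset;     // aligned high part goes to an SGPR
  }
}

int main() {
  // A v16i32 load splits into NumLoads = 4 chunks, so setBufferOffsets is
  // called with Align = 16 * NumLoads = 64.
  const unsigned NumLoads = 4;
  const uint32_t Align = 16 * NumLoads;
  uint32_t SOffset, ImmOffset;
  toySplitOffset(/*Combined=*/4093, Align, SOffset, ImmOffset);
  for (unsigned i = 0; i < NumLoads; ++i) {
    uint32_t InstOffset = ImmOffset + 16 * i; // per-chunk immediate
    assert(InstOffset <= MaxMUBUFImm && "chunk offset must stay encodable");
    std::printf("load %u: soffset=%u offset=%u\n", i, SOffset, InstOffset);
  }
  return 0;
}

For Combined = 4093 this prints soffset=4032 with immediates 61, 77, 93, 109; without the alignment, an immediate near 4095 could push a later chunk past the field's limit.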