Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPU.td              |   3
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  15
-rw-r--r--   llvm/lib/Target/AMDGPU/BUFInstructions.td     |  12
-rw-r--r--   llvm/lib/Target/AMDGPU/MIMGInstructions.td    |  13
-rw-r--r--   llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 499
-rw-r--r--   llvm/lib/Target/AMDGPU/SIISelLowering.h       |   6
-rw-r--r--   llvm/lib/Target/AMDGPU/SIInstructions.td      |  20
7 files changed, 275 insertions, 293 deletions
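
Most of this patch deals with D16 memory instructions, whose result layout differs by subtarget: with unpacked D16 VMem (gfx8.1) each 16-bit component of a load lands zero-extended in its own 32-bit register, while with packed D16 VMem (gfx9) two components share one register. Below is a minimal host-side sketch of the two layouts and of the truncate-and-repack step the new adjustLoadValueTypeImpl performs for the unpacked case; the values and variable names are illustrative assumptions, not code from the patch.

    #include <cstdint>
    #include <cstdio>

    int main() {
      // f16 bit patterns of a two-component D16 load result (1.0h and 2.0h).
      uint16_t Elt0 = 0x3C00, Elt1 = 0x4000;

      // Unpacked D16 (gfx8.1): a v2f16 result occupies two 32-bit registers
      // (v2i32), one zero-extended component per register.
      uint32_t Unpacked[2] = { Elt0, Elt1 };

      // Packed D16 (gfx9): the same v2f16 occupies a single 32-bit register.
      uint32_t Packed = uint32_t(Elt0) | (uint32_t(Elt1) << 16);

      // adjustLoadValueTypeImpl's unpacked path inverts the zero-extension:
      // truncate each 32-bit element to 16 bits, then rebuild the vector.
      uint16_t Restored0 = uint16_t(Unpacked[0]);
      uint16_t Restored1 = uint16_t(Unpacked[1]);

      printf("packed=%08x restored=%04x %04x\n", Packed, Restored0, Restored1);
      return 0;
    }
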
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index db7a4ab521a..84768e632d7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -799,6 +799,9 @@ def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
 def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
   AssemblerPredicate<"FeatureVOP3P">;
 
+def NotHasVOP3PInsts : Predicate<"!Subtarget->hasVOP3PInsts()">,
+  AssemblerPredicate<"!FeatureVOP3P">;
+
 def HasSDWA : Predicate<"Subtarget->hasSDWA()">,
   AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e8720e7a4c5..1b3b9d1e851 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3007,7 +3007,7 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   SDValue X = LHS->getOperand(0);
 
   if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
-      isTypeLegal(MVT::v2i16)) {
+      isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
     // Prefer build_vector as the canonical form if packed types are legal.
     // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
     SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
@@ -3818,12 +3818,13 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
     // TODO: Generalize and move to DAGCombiner
     SDValue Src = N->getOperand(0);
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
-      assert(Src.getValueType() == MVT::i64);
-      SDLoc SL(N);
-      uint64_t CVal = C->getZExtValue();
-      return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
-                         DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
-                         DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+      if (Src.getValueType() == MVT::i64) {
+        SDLoc SL(N);
+        uint64_t CVal = C->getZExtValue();
+        return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
+                           DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+                           DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+      }
     }
 
     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 999d5534050..038a4aa076a 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1060,14 +1060,14 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_X
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
 
 let SubtargetPredicate = HasUnpackedD16VMem in {
-  defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
+  defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
   defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XY_gfx80">;
   defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i32, "BUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
 } // End HasUnpackedD16VMem.
 
 let SubtargetPredicate = HasPackedD16VMem in {
-  defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f16, "BUFFER_LOAD_FORMAT_D16_X">;
-  defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">;
+  defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X">;
+  defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">;
   defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_XY">;
   defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XYZW">;
 } // End HasPackedD16VMem.
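
The performShlCombine change above rests on a bit-level identity: when v2i16 is packed little-endian into an i32 (element 0 in the low half), shl (zext i16:x), 16 and build_vector (0, x) denote the same 32-bit value, which is why build_vector is preferred as the canonical form whenever it is a legal operation. A minimal sketch that checks the identity exhaustively (illustrative only, not code from the patch):

    #include <cassert>
    #include <cstdint>

    // shl (zext i16:x), 16 over i32.
    static uint32_t ShlZext(uint16_t X) { return uint32_t(X) << 16; }

    // i32 view of (v2i16 build_vector lo, hi): element 0 in the low 16 bits.
    static uint32_t PackV2I16(uint16_t Lo, uint16_t Hi) {
      return uint32_t(Lo) | (uint32_t(Hi) << 16);
    }

    int main() {
      for (uint32_t X = 0; X <= 0xFFFF; ++X)
        assert(ShlZext(uint16_t(X)) == PackV2I16(0, uint16_t(X)));
      return 0;
    }
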
@@ -1547,14 +1547,14 @@ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">;
 defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">;
 
 let SubtargetPredicate = HasUnpackedD16VMem in {
-  defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, f16, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
+  defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
   defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XY_gfx80">;
   defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4i32, "TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
 } // End HasUnpackedD16VMem.
 
 let SubtargetPredicate = HasPackedD16VMem in {
-  defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, f16, "TBUFFER_LOAD_FORMAT_D16_X">;
-  defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">;
+  defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X">;
+  defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">;
   defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_XY">;
   defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XYZW">;
 } // End HasPackedD16VMem.
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index fb89e7c8d0e..3779e751ec7 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -610,10 +610,7 @@ multiclass ImageDimD16Helper<AMDGPUImageDimIntrinsic I,
 
   let SubtargetPredicate = HasPackedD16VMem in {
     def _packed_v1 : ImageDimPattern<I, "_V1", f16, "_D16">;
-    // used on gfx810
-    def _packed_v2 : ImageDimPattern<d16helper, "_V1", i32, "_D16">;
-    // used on gfx900
-    def _packed_v2_gfx9 : ImageDimPattern<I, "_V1", v2f16, "_D16">;
+    def _packed_v2 : ImageDimPattern<I, "_V1", v2f16, "_D16">;
     def _packed_v4 : ImageDimPattern<d16helper, "_V2", v2i32, "_D16">;
   } // End HasPackedD16VMem.
 }
@@ -717,7 +714,7 @@ multiclass ImageSampleAltPatterns<SDPatternOperator name, string opcode> {
   } // End HasUnpackedD16VMem.
 
   let SubtargetPredicate = HasPackedD16VMem in {
-    defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V1), i32, "_D16">;
+    defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V1), f16, "_D16">;
     defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V2), v2i32, "_D16">;
   } // End HasPackedD16VMem.
 }
@@ -780,7 +777,7 @@ multiclass ImageLoadAltPatterns<SDPatternOperator name, string opcode> {
   } // End HasUnPackedD16VMem.
 
   let SubtargetPredicate = HasPackedD16VMem in {
-    defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V1), i32, "_D16">;
+    defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V1), f16, "_D16">;
     defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V2), v2i32, "_D16">;
   } // End HasPackedD16VMem.
 }
@@ -865,8 +862,8 @@ defm : ImageLoadAltPatterns<SIImage_load, "IMAGE_LOAD">;
 defm : ImageLoadAltPatterns<SIImage_load_mip, "IMAGE_LOAD_MIP">;
 
 // Image store.
-defm : ImageStorePatterns<SIImage_store, "IMAGE_STORE">;
-defm : ImageStorePatterns<SIImage_store_mip, "IMAGE_STORE_MIP">;
+defm : ImageStorePatterns<int_amdgcn_image_store, "IMAGE_STORE">;
+defm : ImageStorePatterns<int_amdgcn_image_store_mip, "IMAGE_STORE_MIP">;
 defm : ImageStoreAltPatterns<SIImage_store, "IMAGE_STORE">;
 defm : ImageStoreAltPatterns<SIImage_store_mip, "IMAGE_STORE_MIP">;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 41103970b09..a074e557f24 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -139,9 +139,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   if (Subtarget->has16BitInsts()) {
     addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
     addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
-  }
 
-  if (Subtarget->hasVOP3PInsts()) {
+    // Unless there are also VOP3P operations, not all operations are really legal.
     addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
     addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
   }
@@ -174,7 +173,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
-  setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
 
   setOperationAction(ISD::SELECT, MVT::i1, Promote);
   setOperationAction(ISD::SELECT, MVT::i64, Custom);
@@ -423,9 +421,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FMA, MVT::f16, Legal);
     if (!Subtarget->hasFP16Denormals())
       setOperationAction(ISD::FMAD, MVT::f16, Legal);
-  }
 
-  if (Subtarget->hasVOP3PInsts()) {
     for (MVT VT : {MVT::v2i16, MVT::v2f16}) {
       for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
         switch (Op) {
@@ -472,11 +468,34 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
     setOperationAction(ISD::XOR, MVT::v2i16, Promote);
     AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
-    setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
-    AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
-    setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
-    AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
 
+    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
+    AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
+    setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
+    AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
+
+    setOperationAction(ISD::STORE, MVT::v4i16, Promote);
+    AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
+    setOperationAction(ISD::STORE, MVT::v4f16, Promote);
+    AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
+
+    setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
+    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
+
+    if (!Subtarget->hasVOP3PInsts()) {
+      setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
+      setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
+    }
+
+    setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
+    // This isn't really legal, but this avoids the legalizer unrolling it (and
+    // allows matching fneg (fabs x) patterns)
+    setOperationAction(ISD::FABS, MVT::v2f16, Legal);
+  }
+
+  if (Subtarget->hasVOP3PInsts()) {
     setOperationAction(ISD::ADD, MVT::v2i16, Legal);
     setOperationAction(ISD::SUB, MVT::v2i16, Legal);
     setOperationAction(ISD::MUL, MVT::v2i16, Legal);
@@ -489,25 +508,23 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
 
   setOperationAction(ISD::FADD, MVT::v2f16, Legal);
-  setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
   setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
   setOperationAction(ISD::FMA, MVT::v2f16, Legal);
   setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
   setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
   setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
 
-  // This isn't really legal, but this avoids the legalizer unrolling it (and
-  // allows matching fneg (fabs x) patterns)
-  setOperationAction(ISD::FABS, MVT::v2f16, Legal);
-
   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+  }
 
-  setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
-  setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
-  setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
-  setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
+  if (Subtarget->has16BitInsts()) {
+    setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
+    AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
+    setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
+    AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
   } else {
+    // Legalization hack.
     setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
     setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
   }
@@ -3514,205 +3531,72 @@ static unsigned getImageOpcode(unsigned IID) {
   return 0;
 }
 
-static SDValue adjustLoadValueType(SDValue Result, EVT LoadVT, SDLoc DL,
-                                   SelectionDAG &DAG, bool Unpacked) {
+static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
+                                       const SDLoc &DL,
+                                       SelectionDAG &DAG, bool Unpacked) {
+  if (!LoadVT.isVector())
+    return Result;
+
   if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
     // Truncate to v2i16/v4i16.
     EVT IntLoadVT = LoadVT.changeTypeToInteger();
-    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntLoadVT, Result);
+
+    // Workaround legalizer not scalarizing truncate after vector op
+    // legalization by not creating intermediate vector trunc.
+    SmallVector<SDValue, 4> Elts;
+    DAG.ExtractVectorElements(Result, Elts);
+    for (SDValue &Elt : Elts)
+      Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
+
+    Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
 
     // Bitcast to original type (v2f16/v4f16).
-    return DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
+    return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
   }
+
+  // Cast back to the original packed type.
   return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
 }
 
-// This is to lower INTRINSIC_W_CHAIN with illegal result types.
-SDValue SITargetLowering::lowerIntrinsicWChain_IllegalReturnType(SDValue Op,
-                                     SDValue &Chain, SelectionDAG &DAG) const {
-  EVT LoadVT = Op.getValueType();
-  // TODO: handle v3f16.
-  if (LoadVT != MVT::v2f16 && LoadVT != MVT::v4f16)
-    return SDValue();
+SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
+                                              MemSDNode *M,
+                                              SelectionDAG &DAG,
+                                              bool IsIntrinsic) const {
+  SDLoc DL(M);
+  SmallVector<SDValue, 10> Ops;
+  Ops.reserve(M->getNumOperands());
 
-  bool Unpacked = Subtarget->hasUnpackedD16VMem();
-  EVT UnpackedLoadVT = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
-  EVT EquivLoadVT = Unpacked ? UnpackedLoadVT :
-                    getEquivalentMemType(*DAG.getContext(), LoadVT);
-  // Change from v4f16/v2f16 to EquivLoadVT.
-  SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
+  Ops.push_back(M->getOperand(0));
+  if (IsIntrinsic)
+    Ops.push_back(DAG.getConstant(Opcode, DL, MVT::i32));
 
-  SDValue Res;
-  SDLoc DL(Op);
-  MemSDNode *M = cast<MemSDNode>(Op);
-  unsigned IID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
-  switch (IID) {
-  case Intrinsic::amdgcn_tbuffer_load: {
-    SDValue Ops[] = {
-      Op.getOperand(0),  // Chain
-      Op.getOperand(2),  // rsrc
-      Op.getOperand(3),  // vindex
-      Op.getOperand(4),  // voffset
-      Op.getOperand(5),  // soffset
-      Op.getOperand(6),  // offset
-      Op.getOperand(7),  // dfmt
-      Op.getOperand(8),  // nfmt
-      Op.getOperand(9),  // glc
-      Op.getOperand(10)  // slc
-    };
-    Res = DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, DL,
-                                  VTList, Ops, M->getMemoryVT(),
-                                  M->getMemOperand());
-    Chain = Res.getValue(1);
-    return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
-  }
-  case Intrinsic::amdgcn_buffer_load_format: {
-    SDValue Ops[] = {
-      Op.getOperand(0),  // Chain
-      Op.getOperand(2),  // rsrc
-      Op.getOperand(3),  // vindex
-      Op.getOperand(4),  // offset
-      Op.getOperand(5),  // glc
-      Op.getOperand(6)   // slc
-    };
-    Res = DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
-                                  DL, VTList, Ops, M->getMemoryVT(),
-                                  M->getMemOperand());
-    Chain = Res.getValue(1);
-    return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
-  }
-  case Intrinsic::amdgcn_image_load:
-  case Intrinsic::amdgcn_image_load_mip: {
-    SDValue Ops[] = {
-      Op.getOperand(0),  // Chain
-      Op.getOperand(2),  // vaddr
-      Op.getOperand(3),  // rsrc
-      Op.getOperand(4),  // dmask
-      Op.getOperand(5),  // glc
-      Op.getOperand(6),  // slc
-      Op.getOperand(7),  // lwe
-      Op.getOperand(8)   // da
-    };
-    unsigned Opc = getImageOpcode(IID);
-    Res = DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, M->getMemoryVT(),
-                                  M->getMemOperand());
-    Chain = Res.getValue(1);
-    return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
-  }
-  // Basic sample.
-  case Intrinsic::amdgcn_image_sample:
-  case Intrinsic::amdgcn_image_sample_cl:
-  case Intrinsic::amdgcn_image_sample_d:
-  case Intrinsic::amdgcn_image_sample_d_cl:
-  case Intrinsic::amdgcn_image_sample_l:
-  case Intrinsic::amdgcn_image_sample_b:
-  case Intrinsic::amdgcn_image_sample_b_cl:
-  case Intrinsic::amdgcn_image_sample_lz:
-  case Intrinsic::amdgcn_image_sample_cd:
-  case Intrinsic::amdgcn_image_sample_cd_cl:
-
-  // Sample with comparison.
-  case Intrinsic::amdgcn_image_sample_c:
-  case Intrinsic::amdgcn_image_sample_c_cl:
-  case Intrinsic::amdgcn_image_sample_c_d:
-  case Intrinsic::amdgcn_image_sample_c_d_cl:
-  case Intrinsic::amdgcn_image_sample_c_l:
-  case Intrinsic::amdgcn_image_sample_c_b:
-  case Intrinsic::amdgcn_image_sample_c_b_cl:
-  case Intrinsic::amdgcn_image_sample_c_lz:
-  case Intrinsic::amdgcn_image_sample_c_cd:
-  case Intrinsic::amdgcn_image_sample_c_cd_cl:
-
-  // Sample with offsets.
-  case Intrinsic::amdgcn_image_sample_o:
-  case Intrinsic::amdgcn_image_sample_cl_o:
-  case Intrinsic::amdgcn_image_sample_d_o:
-  case Intrinsic::amdgcn_image_sample_d_cl_o:
-  case Intrinsic::amdgcn_image_sample_l_o:
-  case Intrinsic::amdgcn_image_sample_b_o:
-  case Intrinsic::amdgcn_image_sample_b_cl_o:
-  case Intrinsic::amdgcn_image_sample_lz_o:
-  case Intrinsic::amdgcn_image_sample_cd_o:
-  case Intrinsic::amdgcn_image_sample_cd_cl_o:
+  // Skip 1, as it is the intrinsic ID.
+  for (unsigned I = 2, E = M->getNumOperands(); I != E; ++I)
+    Ops.push_back(M->getOperand(I));
 
-  // Sample with comparison and offsets.
-  case Intrinsic::amdgcn_image_sample_c_o:
-  case Intrinsic::amdgcn_image_sample_c_cl_o:
-  case Intrinsic::amdgcn_image_sample_c_d_o:
-  case Intrinsic::amdgcn_image_sample_c_d_cl_o:
-  case Intrinsic::amdgcn_image_sample_c_l_o:
-  case Intrinsic::amdgcn_image_sample_c_b_o:
-  case Intrinsic::amdgcn_image_sample_c_b_cl_o:
-  case Intrinsic::amdgcn_image_sample_c_lz_o:
-  case Intrinsic::amdgcn_image_sample_c_cd_o:
-  case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
+  bool Unpacked = Subtarget->hasUnpackedD16VMem();
+  EVT LoadVT = M->getValueType(0);
 
-  // Basic gather4
-  case Intrinsic::amdgcn_image_gather4:
-  case Intrinsic::amdgcn_image_gather4_cl:
-  case Intrinsic::amdgcn_image_gather4_l:
-  case Intrinsic::amdgcn_image_gather4_b:
-  case Intrinsic::amdgcn_image_gather4_b_cl:
-  case Intrinsic::amdgcn_image_gather4_lz:
+  EVT UnpackedLoadVT = LoadVT.isVector() ?
+    EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                     LoadVT.getVectorNumElements()) : LoadVT;
+  EVT EquivLoadVT = LoadVT;
+  if (LoadVT.isVector()) {
+    EquivLoadVT = Unpacked ? UnpackedLoadVT :
+                  getEquivalentMemType(*DAG.getContext(), LoadVT);
+  }
 
-  // Gather4 with comparison
-  case Intrinsic::amdgcn_image_gather4_c:
-  case Intrinsic::amdgcn_image_gather4_c_cl:
-  case Intrinsic::amdgcn_image_gather4_c_l:
-  case Intrinsic::amdgcn_image_gather4_c_b:
-  case Intrinsic::amdgcn_image_gather4_c_b_cl:
-  case Intrinsic::amdgcn_image_gather4_c_lz:
+  // Change from v4f16/v2f16 to EquivLoadVT.
+  SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
 
-  // Gather4 with offsets
-  case Intrinsic::amdgcn_image_gather4_o:
-  case Intrinsic::amdgcn_image_gather4_cl_o:
-  case Intrinsic::amdgcn_image_gather4_l_o:
-  case Intrinsic::amdgcn_image_gather4_b_o:
-  case Intrinsic::amdgcn_image_gather4_b_cl_o:
-  case Intrinsic::amdgcn_image_gather4_lz_o:
+  SDValue Load
+    = DAG.getMemIntrinsicNode(IsIntrinsic ? ISD::INTRINSIC_W_CHAIN : Opcode, DL,
+                              VTList, Ops, M->getMemoryVT(),
+                              M->getMemOperand());
 
-  // Gather4 with comparison and offsets
-  case Intrinsic::amdgcn_image_gather4_c_o:
-  case Intrinsic::amdgcn_image_gather4_c_cl_o:
-  case Intrinsic::amdgcn_image_gather4_c_l_o:
-  case Intrinsic::amdgcn_image_gather4_c_b_o:
-  case Intrinsic::amdgcn_image_gather4_c_b_cl_o:
-  case Intrinsic::amdgcn_image_gather4_c_lz_o: {
-    SDValue Ops[] = {
-      Op.getOperand(0),  // Chain
-      Op.getOperand(2),  // vaddr
-      Op.getOperand(3),  // rsrc
-      Op.getOperand(4),  // sampler
-      Op.getOperand(5),  // dmask
-      Op.getOperand(6),  // unorm
-      Op.getOperand(7),  // glc
-      Op.getOperand(8),  // slc
-      Op.getOperand(9),  // lwe
-      Op.getOperand(10)  // da
-    };
-    unsigned Opc = getImageOpcode(IID);
-    Res = DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, M->getMemoryVT(),
-                                  M->getMemOperand());
-    Chain = Res.getValue(1);
-    return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
-  }
-  default: {
-    const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr =
-        AMDGPU::lookupD16ImageDimIntrinsicByIntr(IID);
-    if (D16ImageDimIntr) {
-      SmallVector<SDValue, 20> Ops;
-      for (auto Value : Op.getNode()->op_values())
-        Ops.push_back(Value);
-      Ops[1] = DAG.getConstant(D16ImageDimIntr->D16HelperIntr, DL, MVT::i32);
-      Res = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTList, Ops,
-                                    M->getMemoryVT(), M->getMemOperand());
-      Chain = Res.getValue(1);
-      return adjustLoadValueType(Res, LoadVT, DL, DAG, Unpacked);
-    }
+  SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
 
-    return SDValue();
-  }
-  }
+  return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
 }
 
 void SITargetLowering::ReplaceNodeResults(SDNode *N,
@@ -3767,13 +3651,12 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
     break;
   }
   case ISD::INTRINSIC_W_CHAIN: {
-    SDValue Chain;
-    if (SDValue Res = lowerIntrinsicWChain_IllegalReturnType(SDValue(N, 0),
-                                                             Chain, DAG)) {
+    if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
      Results.push_back(Res);
-      Results.push_back(Chain);
+      Results.push_back(Res.getValue(1));
      return;
    }
+    break;
  }
  case ISD::SELECT: {
@@ -4279,22 +4162,24 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
                                             SelectionDAG &DAG) const {
   SDLoc SL(Op);
   EVT VT = Op.getValueType();
-  assert(VT == MVT::v4i16 || VT == MVT::v4f16);
-  EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
+  assert(VT == MVT::v2f16 || VT == MVT::v2i16);
+
+  SDValue Lo = Op.getOperand(0);
+  SDValue Hi = Op.getOperand(1);
+
+  Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
+  Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
 
-  // Turn into pair of packed build_vectors.
-  // TODO: Special case for constants that can be materialized with s_mov_b64.
-  SDValue Lo = DAG.getBuildVector(HalfVT, SL,
-                                  { Op.getOperand(0), Op.getOperand(1) });
-  SDValue Hi = DAG.getBuildVector(HalfVT, SL,
-                                  { Op.getOperand(2), Op.getOperand(3) });
+  Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
+  Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
 
-  SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
-  SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
+  SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
+                              DAG.getConstant(16, SL, MVT::i32));
 
-  SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
-  return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
+  SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
+
+  return DAG.getNode(ISD::BITCAST, SL, VT, Or);
 }
 
 bool
@@ -4829,13 +4714,23 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
         AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
     EVT VT = Op.getValueType();
     EVT IntVT = VT.changeTypeToInteger();
-
     auto *M = cast<MemSDNode>(Op);
+    EVT LoadVT = Op.getValueType();
+    bool IsD16 = LoadVT.getScalarType() == MVT::f16;
+    if (IsD16)
+      return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG);
+
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
                                    M->getMemOperand());
   }
   case Intrinsic::amdgcn_tbuffer_load: {
     MemSDNode *M = cast<MemSDNode>(Op);
+    EVT LoadVT = Op.getValueType();
+    bool IsD16 = LoadVT.getScalarType() == MVT::f16;
+    if (IsD16) {
+      return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG);
+    }
+
     SDValue Ops[] = {
       Op.getOperand(0),  // Chain
       Op.getOperand(2),  // rsrc
@@ -4849,10 +4744,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
       Op.getOperand(10)  // slc
     };
 
-    EVT VT = Op.getValueType();
-
     return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
-                                   Op->getVTList(), Ops, VT, M->getMemOperand());
+                                   Op->getVTList(), Ops, LoadVT,
+                                   M->getMemOperand());
   }
   case Intrinsic::amdgcn_buffer_atomic_swap:
   case Intrinsic::amdgcn_buffer_atomic_add:
@@ -4933,6 +4827,18 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                             Op->getVTList(), Ops, VT, M->getMemOperand());
   }
+  case Intrinsic::amdgcn_image_load:
+  case Intrinsic::amdgcn_image_load_mip: {
+    EVT LoadVT = Op.getValueType();
+    if ((Subtarget->hasUnpackedD16VMem() && LoadVT == MVT::v2f16) ||
+        LoadVT == MVT::v4f16) {
+      MemSDNode *M = cast<MemSDNode>(Op);
+      return adjustLoadValueType(getImageOpcode(IntrID), M, DAG);
+    }
+
+    return SDValue();
+  }
+
   // Basic sample.
   case Intrinsic::amdgcn_image_sample:
   case Intrinsic::amdgcn_image_sample_cl:
@@ -4979,7 +4885,39 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
   case Intrinsic::amdgcn_image_sample_c_b_cl_o:
   case Intrinsic::amdgcn_image_sample_c_lz_o:
   case Intrinsic::amdgcn_image_sample_c_cd_o:
-  case Intrinsic::amdgcn_image_sample_c_cd_cl_o: {
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
+
+  // Basic gather4
+  case Intrinsic::amdgcn_image_gather4:
+  case Intrinsic::amdgcn_image_gather4_cl:
+  case Intrinsic::amdgcn_image_gather4_l:
+  case Intrinsic::amdgcn_image_gather4_b:
+  case Intrinsic::amdgcn_image_gather4_b_cl:
+  case Intrinsic::amdgcn_image_gather4_lz:
+
+  // Gather4 with comparison
+  case Intrinsic::amdgcn_image_gather4_c:
+  case Intrinsic::amdgcn_image_gather4_c_cl:
+  case Intrinsic::amdgcn_image_gather4_c_l:
+  case Intrinsic::amdgcn_image_gather4_c_b:
+  case Intrinsic::amdgcn_image_gather4_c_b_cl:
+  case Intrinsic::amdgcn_image_gather4_c_lz:
+
+  // Gather4 with offsets
+  case Intrinsic::amdgcn_image_gather4_o:
+  case Intrinsic::amdgcn_image_gather4_cl_o:
+  case Intrinsic::amdgcn_image_gather4_l_o:
+  case Intrinsic::amdgcn_image_gather4_b_o:
+  case Intrinsic::amdgcn_image_gather4_b_cl_o:
+  case Intrinsic::amdgcn_image_gather4_lz_o:
+
+  // Gather4 with comparison and offsets
+  case Intrinsic::amdgcn_image_gather4_c_o:
+  case Intrinsic::amdgcn_image_gather4_c_cl_o:
+  case Intrinsic::amdgcn_image_gather4_c_l_o:
+  case Intrinsic::amdgcn_image_gather4_c_b_o:
+  case Intrinsic::amdgcn_image_gather4_c_b_cl_o:
+  case Intrinsic::amdgcn_image_gather4_c_lz_o: {
     // Replace dmask with everything disabled with undef.
     const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5));
     if (!DMask || DMask->isNullValue()) {
@@ -4987,9 +4925,32 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
       return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op));
     }
 
+    if ((Subtarget->hasUnpackedD16VMem() && Op.getValueType() == MVT::v2f16) ||
+        Op.getValueType() == MVT::v4f16) {
+      return adjustLoadValueType(getImageOpcode(IntrID), cast<MemSDNode>(Op),
+                                 DAG);
+    }
+
     return SDValue();
   }
   default:
+    EVT LoadVT = Op.getValueType();
+    if (LoadVT.getScalarSizeInBits() != 16)
+      return SDValue();
+
+    const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr =
+        AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrID);
+    if (D16ImageDimIntr) {
+      bool Unpacked = Subtarget->hasUnpackedD16VMem();
+      MemSDNode *M = cast<MemSDNode>(Op);
+
+      if (isTypeLegal(LoadVT) && (!Unpacked || LoadVT == MVT::f16))
+        return SDValue();
+
+      return adjustLoadValueType(D16ImageDimIntr->D16HelperIntr,
+                                 M, DAG, true);
+    }
+
     return SDValue();
   }
 }
@@ -4997,26 +4958,32 @@ SDValue SITargetLowering::handleD16VData(SDValue VData,
                                          SelectionDAG &DAG) const {
   EVT StoreVT = VData.getValueType();
+
+  // No change for f16 and legal vector D16 types.
+  if (!StoreVT.isVector())
+    return VData;
+
   SDLoc DL(VData);
+  assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
 
-  if (StoreVT.isVector()) {
-    assert ((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
-    if (!Subtarget->hasUnpackedD16VMem()) {
-      if (!isTypeLegal(StoreVT)) {
-        // If Target supports packed vmem, we just need to workaround
-        // the illegal type by casting to an equivalent one.
-        EVT EquivStoreVT = getEquivalentMemType(*DAG.getContext(), StoreVT);
-        return DAG.getNode(ISD::BITCAST, DL, EquivStoreVT, VData);
-      }
-    } else { // We need to unpack the packed data to store.
-      EVT IntStoreVT = StoreVT.changeTypeToInteger();
-      SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
-      EVT EquivStoreVT = (StoreVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
-      return DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
-    }
+  if (Subtarget->hasUnpackedD16VMem()) {
+    // We need to unpack the packed data to store.
+    EVT IntStoreVT = StoreVT.changeTypeToInteger();
+    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
+
+    EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                                        StoreVT.getVectorNumElements());
+    SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
+    return DAG.UnrollVectorOp(ZExt.getNode());
   }
 
-  // No change for f16 and legal vector D16 types.
-  return VData;
+  if (isTypeLegal(StoreVT))
+    return VData;
+
+  // If target supports packed vmem, we just need to workaround
+  // the illegal type by casting to an equivalent one.
+  EVT EquivStoreVT = getEquivalentMemType(*DAG.getContext(), StoreVT);
+  return DAG.getNode(ISD::BITCAST, DL, EquivStoreVT, VData);
 }
 
 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
@@ -5207,46 +5174,48 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                    M->getMemoryVT(), M->getMemOperand());
   }
-
   case Intrinsic::amdgcn_image_store:
   case Intrinsic::amdgcn_image_store_mip: {
     SDValue VData = Op.getOperand(2);
-    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
-    if (IsD16)
+    if ((Subtarget->hasUnpackedD16VMem() &&
+         VData.getValueType() == MVT::v2f16) ||
+        VData.getValueType() == MVT::v4f16) {
+      SDValue Chain = Op.getOperand(0);
+
       VData = handleD16VData(VData, DAG);
-    SDValue Ops[] = {
-      Chain, // Chain
-      VData, // vdata
-      Op.getOperand(3), // vaddr
-      Op.getOperand(4), // rsrc
-      Op.getOperand(5), // dmask
-      Op.getOperand(6), // glc
-      Op.getOperand(7), // slc
-      Op.getOperand(8), // lwe
-      Op.getOperand(9)  // da
-    };
-    unsigned Opc = (IntrinsicID==Intrinsic::amdgcn_image_store) ?
-                   AMDGPUISD::IMAGE_STORE : AMDGPUISD::IMAGE_STORE_MIP;
-    MemSDNode *M = cast<MemSDNode>(Op);
-    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
-                                   M->getMemoryVT(), M->getMemOperand());
-  }
+
+      SDValue Ops[] = {
+        Chain, // Chain
+        VData, // vdata
+        Op.getOperand(3), // vaddr
+        Op.getOperand(4), // rsrc
+        Op.getOperand(5), // dmask
+        Op.getOperand(6), // glc
+        Op.getOperand(7), // slc
+        Op.getOperand(8), // lwe
+        Op.getOperand(9)  // da
+      };
+      unsigned Opc = (IntrinsicID == Intrinsic::amdgcn_image_store) ?
+        AMDGPUISD::IMAGE_STORE : AMDGPUISD::IMAGE_STORE_MIP;
+      MemSDNode *M = cast<MemSDNode>(Op);
+      return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+                                     M->getMemoryVT(), M->getMemOperand());
+    }
+
+    return SDValue();
+  }
   default: {
     const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr =
         AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrinsicID);
     if (D16ImageDimIntr) {
       SDValue VData = Op.getOperand(2);
       EVT StoreVT = VData.getValueType();
-      if ((StoreVT == MVT::v2f16 && !isTypeLegal(StoreVT)) ||
-          StoreVT == MVT::v4f16) {
-        VData = handleD16VData(VData, DAG);
+      if (((StoreVT == MVT::v2f16 || StoreVT == MVT::v4f16) &&
+           Subtarget->hasUnpackedD16VMem()) ||
+          !isTypeLegal(StoreVT)) {
+        SmallVector<SDValue, 12> Ops(Op.getNode()->op_values());
 
-        SmallVector<SDValue, 12> Ops;
-        for (auto Value : Op.getNode()->op_values())
-          Ops.push_back(Value);
         Ops[1] = DAG.getConstant(D16ImageDimIntr->D16HelperIntr, DL, MVT::i32);
-        Ops[2] = VData;
+        Ops[2] = handleD16VData(VData, DAG);
 
         MemSDNode *M = cast<MemSDNode>(Op);
         return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, Op->getVTList(),
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index ae8b19a46fc..ffb644fafbe 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -60,8 +60,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
-  SDValue lowerIntrinsicWChain_IllegalReturnType(SDValue Op, SDValue &Chain,
-                                                 SelectionDAG &DAG) const;
+  SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M,
+                              SelectionDAG &DAG,
+                              bool IsIntrinsic = false) const;
+
   SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const;
 
   /// Converts \p Op, which must be of floating point type, to the
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 8080151d6d9..363b0e71223 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -871,11 +871,13 @@ def : ClampPat<V_MAX_F32_e64, f32>;
 def : ClampPat<V_MAX_F64, f64>;
 def : ClampPat<V_MAX_F16_e64, f16>;
 
+let SubtargetPredicate = HasVOP3PInsts in {
 def : GCNPat <
   (v2f16 (AMDGPUclamp (VOP3PMods v2f16:$src0, i32:$src0_modifiers))),
   (V_PK_MAX_F16 $src0_modifiers, $src0, $src0_modifiers, $src0, DSTCLAMP.ENABLE)
 >;
+}
 
 /********** ================================ **********/
 /********** Floating point absolute/negative **********/
@@ -1333,11 +1335,13 @@ def : GCNPat<
   (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)
 >;
 
+let SubtargetPredicate = HasVOP3PInsts in {
 def : GCNPat<
   (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
   (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)
 >;
 }
+}
 
 let OtherPredicates = [NoFP32Denormals] in {
 def : GCNPat<
@@ -1387,11 +1391,6 @@ class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPa
 def : ExpPattern<AMDGPUexport, i32, EXP>;
 def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>;
 
-def : GCNPat <
-  (v2i16 (build_vector i16:$src0, i16:$src1)),
-  (v2i16 (S_PACK_LL_B32_B16 $src0, $src1))
->;
-
 // COPY_TO_REGCLASS is workaround tablegen bug from multiple outputs
 // from S_LSHL_B32's multiple outputs from implicit scc def.
 def : GCNPat <
@@ -1399,6 +1398,13 @@ def : GCNPat <
   (v2i16 (COPY_TO_REGCLASS (S_LSHL_B32 i16:$src1, (i16 16)), SReg_32_XM0))
 >;
+
+let SubtargetPredicate = HasVOP3PInsts in {
+def : GCNPat <
+  (v2i16 (build_vector i16:$src0, i16:$src1)),
+  (v2i16 (S_PACK_LL_B32_B16 $src0, $src1))
+>;
+
 // With multiple uses of the shift, this will duplicate the shift and
 // increase register pressure.
 def : GCNPat <
@@ -1406,6 +1412,7 @@ def : GCNPat <
   (v2i16 (S_PACK_LH_B32_B16 i16:$src0, i32:$src1))
 >;
 
+
 def : GCNPat <
   (v2i16 (build_vector (i16 (trunc (srl_oneuse i32:$src0, (i32 16)))),
                        (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))),
@@ -1418,6 +1425,9 @@ def : GCNPat <
   (v2f16 (S_PACK_LL_B32_B16 $src0, $src1))
 >;
 
+} // End SubtargetPredicate = HasVOP3PInsts
+
+
 // def : GCNPat <
 //   (v2f16 (scalar_to_vector f16:$src0)),
 //   (COPY $src0)
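
The rewritten lowerBUILD_VECTOR in SIISelLowering.cpp and the HasVOP3PInsts guard on the S_PACK_LL_B32_B16 patterns above are two halves of the same decision: with VOP3P, (v2i16 (build_vector lo, hi)) selects directly to S_PACK_LL_B32_B16; without it, the node is custom-lowered by hand into zext/shl/or on i32. A minimal sketch of the arithmetic the custom lowering emits (illustrative only; the f16 constants are assumptions, not values from the patch):

    #include <cstdint>
    #include <cstdio>

    // Mirrors the rewritten lowerBUILD_VECTOR: bitcast both f16 operands to
    // i16, zero-extend to i32, shift the high element, then OR and bitcast
    // the result back to v2f16.
    static uint32_t LowerBuildVector(uint16_t LoBits, uint16_t HiBits) {
      uint32_t Lo = LoBits;                    // ISD::ZERO_EXTEND
      uint32_t ShlHi = uint32_t(HiBits) << 16; // ISD::SHL by 16
      return Lo | ShlHi;                       // ISD::OR; bitcast to v2f16
    }

    int main() {
      // f16 bit patterns: 1.0h = 0x3C00, -2.0h = 0xC000.
      printf("%08x\n", LowerBuildVector(0x3C00, 0xC000)); // prints c0003c00
      return 0;
    }
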