author     Matt Arsenault <Matthew.Arsenault@amd.com>  2018-06-15 15:15:46 +0000
committer  Matt Arsenault <Matthew.Arsenault@amd.com>  2018-06-15 15:15:46 +0000
commit     02dc7e19e2910b2b3a963105684af0f830885360
tree       4fb36d460658188a9eaac6e5b0d1abe8502a39db /llvm/lib/Target/AMDGPU
parent     fa5597b24da47d5ecec4560f3f76f4bb08b405bc
AMDGPU: Make v4i16/v4f16 legal
Some image loads return these types, and working around
them not being legal is awkward.
llvm-svn: 334835
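
At the ISel level the patch follows the usual recipe for legalizing a new vector type: give v4i16/v4f16 a register class, then mark the operations that have no 4-wide instruction as Custom so they can be split later. A condensed sketch of the SITargetLowering constructor changes from the diff below (fragments only, not a standalone translation unit; the cases elided here are noted in the comments):

```cpp
// SITargetLowering constructor: v4i16/v4f16 values live in 64-bit SGPR tuples
// (two packed 16-bit elements per 32-bit half).
addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);

// No instruction operates on the 4-wide form directly, so arithmetic on these
// types is routed to custom lowering instead of being scalarized.
setOperationAction(ISD::ADD,    MVT::v4i16, Custom);  // likewise SUB, MUL
setOperationAction(ISD::SHL,    MVT::v4i16, Custom);  // likewise SRA, SRL
setOperationAction(ISD::SMIN,   MVT::v4i16, Custom);  // likewise SMAX, UMIN, UMAX
setOperationAction(ISD::FADD,   MVT::v4f16, Custom);  // likewise FMUL, FMINNUM, FMAXNUM
setOperationAction(ISD::FNEG,   MVT::v4f16, Custom);
setOperationAction(ISD::FABS,   MVT::v4f16, Custom);
setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
```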
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td   |   4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  18
-rw-r--r--  llvm/lib/Target/AMDGPU/BUFInstructions.td     |  12
-rw-r--r--  llvm/lib/Target/AMDGPU/MIMGInstructions.td    |  33
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 206
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.h       |   3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td      |  41
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.td      |  10
8 files changed, 235 insertions(+), 92 deletions(-)
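
Once those operations reach custom lowering, the patch's new splitBinaryVectorOp (with a unary sibling, splitUnaryVectorOp, for FNEG/FABS) splits the v4i16/v4f16 node into two v2i16/v2f16 halves and concatenates the results, rather than letting LegalizeDAG fully scalarize it now that the wide type is legal. The binary case, condensed from the SIISelLowering.cpp hunk below:

```cpp
SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
                                              SelectionDAG &DAG) const {
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16);

  // Split both operands into their low and high v2i16/v2f16 halves.
  SDValue Lo0, Hi0, Lo1, Hi1;
  std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
  std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);

  SDLoc SL(Op);
  // Re-emit the same opcode on each half, preserving the node flags.
  SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
                             Op->getFlags());
  SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
                             Op->getFlags());

  // Reassemble the halves as the original 4-element type.
  return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, OpLo, OpHi);
}
```

CONCAT_VECTORS on the 4-element types is in turn handled in AMDGPUISelLowering.cpp by bitcasting each half to i32 and rebuilding through v2i32, so the split never forces a scalarization.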
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index c1c066fd140..79a64c6abb3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -127,7 +127,7 @@ def CC_AMDGPU_Func : CallingConv<[ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>, - CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>, + CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>, CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>, CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>, CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>, @@ -144,7 +144,7 @@ def RetCC_AMDGPU_Func : CallingConv<[ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>, - CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">> + CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">> ]>; def CC_AMDGPU : CallingConv<[ diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 19106a5ae8d..8685de871de 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -73,7 +73,9 @@ static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT, case MVT::i64: case MVT::f64: case MVT::v2i32: - case MVT::v2f32: { + case MVT::v2f32: + case MVT::v4i16: + case MVT::v4f16: { // Up to SGPR0-SGPR39 return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, &AMDGPU::SGPR_64RegClass, 20); @@ -94,7 +96,9 @@ static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT, case MVT::i64: case MVT::f64: case MVT::v2i32: - case MVT::v2f32: { + case MVT::v2f32: + case MVT::v4i16: + case MVT::v4f16: { return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, &AMDGPU::VReg_64RegClass, 31); } @@ -1234,6 +1238,16 @@ SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { SmallVector<SDValue, 8> Args; + EVT VT = Op.getValueType(); + if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SDLoc SL(Op); + SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0)); + SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1)); + + SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi }); + return DAG.getNode(ISD::BITCAST, SL, VT, BV); + } + for (const SDUse &U : Op->ops()) DAG.ExtractVectorElements(U.get(), Args); diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 7c0b5e9d8f6..18e28b90e06 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1084,8 +1084,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in { let SubtargetPredicate = HasPackedD16VMem in { defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">; - defm : 
MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_XY">; - defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XYZW">; + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4f16, "BUFFER_LOAD_FORMAT_D16_XYZW">; } // End HasPackedD16VMem. defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">; @@ -1145,8 +1144,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in { let SubtargetPredicate = HasPackedD16VMem in { defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2f16, "BUFFER_STORE_FORMAT_D16_XY">; - defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i32, "BUFFER_STORE_FORMAT_D16_XY">; - defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i32, "BUFFER_STORE_FORMAT_D16_XYZW">; + defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4f16, "BUFFER_STORE_FORMAT_D16_XYZW">; } // End HasPackedD16VMem. defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">; @@ -1571,8 +1569,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in { let SubtargetPredicate = HasPackedD16VMem in { defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">; - defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_XY">; - defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XYZW">; + defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4f16, "TBUFFER_LOAD_FORMAT_D16_XYZW">; } // End HasPackedD16VMem. multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, @@ -1633,8 +1630,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in { let SubtargetPredicate = HasPackedD16VMem in { defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X">; defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2f16, "TBUFFER_STORE_FORMAT_D16_XY">; - defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_XY">; - defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2i32, "TBUFFER_STORE_FORMAT_D16_XYZW">; + defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4f16, "TBUFFER_STORE_FORMAT_D16_XYZW">; } // End HasPackedD16VMem. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index b63c120b348..702660c00b5 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -594,12 +594,6 @@ foreach intr = !listconcat(AMDGPUImageDimIntrinsics, def intr#_pat_v4 : ImageDimPattern<intr, "_V4", v4f32>; } -// v2f16 and v4f16 are used as data types to signal that D16 should be used. -// However, they are not (always) legal types, and the SelectionDAG requires us -// to legalize them before running any patterns. So we legalize them by -// converting to an int type of equal size and using an internal 'd16helper' -// intrinsic instead which signifies both the use of D16 and actually allows -// this integer-based return type. 
multiclass ImageDimD16Helper<AMDGPUImageDimIntrinsic I, AMDGPUImageDimIntrinsic d16helper> { let SubtargetPredicate = HasUnpackedD16VMem in { @@ -611,7 +605,7 @@ multiclass ImageDimD16Helper<AMDGPUImageDimIntrinsic I, let SubtargetPredicate = HasPackedD16VMem in { def _packed_v1 : ImageDimPattern<I, "_V1", f16, "_D16">; def _packed_v2 : ImageDimPattern<I, "_V1", v2f16, "_D16">; - def _packed_v4 : ImageDimPattern<d16helper, "_V2", v2i32, "_D16">; + def _packed_v4 : ImageDimPattern<I, "_V2", v4f16, "_D16">; } // End HasPackedD16VMem. } @@ -653,10 +647,7 @@ foreach intr = AMDGPUImageDimGatherIntrinsics in { } // End HasUnpackedD16VMem. let SubtargetPredicate = HasPackedD16VMem in { - def intr#_packed_v4 : - ImageDimPattern<!cast<AMDGPUImageDimIntrinsic>( - "int_SI_image_d16helper_" # intr.P.OpMod # intr.P.Dim.Name), - "_V2", v2i32, "_D16">; + def intr#_packed_v4 : ImageDimPattern<intr, "_V2", v4f16, "_D16">; } // End HasPackedD16VMem. } @@ -703,6 +694,7 @@ multiclass ImageSamplePatterns<SDPatternOperator name, string opcode> { let SubtargetPredicate = HasPackedD16VMem in { defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V1), f16, "_D16">; defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V1), v2f16, "_D16">; + defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V2), v4f16, "_D16">; } // End HasPackedD16VMem. } @@ -712,16 +704,15 @@ multiclass ImageSampleAltPatterns<SDPatternOperator name, string opcode> { defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V2), v2i32, "_D16_gfx80">; defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V4), v4i32, "_D16_gfx80">; } // End HasUnpackedD16VMem. - - let SubtargetPredicate = HasPackedD16VMem in { - defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V1), f16, "_D16">; - defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V2), v2i32, "_D16">; - } // End HasPackedD16VMem. } // ImageGather4 patterns. multiclass ImageGather4Patterns<SDPatternOperator name, string opcode> { defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V4), v4f32>; + + let SubtargetPredicate = HasPackedD16VMem in { + defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V2), v4f16, "_D16">; + } // End HasPackedD16VMem. } // ImageGather4 alternative patterns for illegal vector half Types. @@ -730,9 +721,6 @@ multiclass ImageGather4AltPatterns<SDPatternOperator name, string opcode> { defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V4), v4i32, "_D16_gfx80">; } // End HasUnpackedD16VMem. - let SubtargetPredicate = HasPackedD16VMem in { - defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V2), v2i32, "_D16">; - } // End HasPackedD16VMem. } // ImageLoad for amdgcn. @@ -766,6 +754,7 @@ multiclass ImageLoadPatterns<SDPatternOperator name, string opcode> { let SubtargetPredicate = HasPackedD16VMem in { defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V1), f16, "_D16">; defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V1), v2f16, "_D16">; + defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V2), v4f16, "_D16">; } // End HasPackedD16VMem. } @@ -775,11 +764,6 @@ multiclass ImageLoadAltPatterns<SDPatternOperator name, string opcode> { defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V2), v2i32, "_D16_gfx80">; defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V4), v4i32, "_D16_gfx80">; } // End HasUnPackedD16VMem. 
- - let SubtargetPredicate = HasPackedD16VMem in { - defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V1), f16, "_D16">; - defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V2), v2i32, "_D16">; - } // End HasPackedD16VMem. } // ImageStore for amdgcn. @@ -813,6 +797,7 @@ multiclass ImageStorePatterns<SDPatternOperator name, string opcode> { let SubtargetPredicate = HasPackedD16VMem in { defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V1), f16, "_D16">; defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V1), v2f16, "_D16">; + defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V2), v4f16, "_D16">; } // End HasPackedD16VMem. } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index df396e6ad1f..1b91d743641 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -143,6 +143,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // Unless there are also VOP3P operations, not operations are really legal. addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass); addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass); + addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass); + addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass); } computeRegisterProperties(STI.getRegisterInfo()); @@ -237,7 +239,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, - MVT::v2i64, MVT::v2f64}) { + MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16 }) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -260,6 +262,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } } + setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand); + // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that // is expanded to avoid having two separate loops in case the index is a VGPR. 
@@ -426,7 +430,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (!Subtarget->hasFP16Denormals()) setOperationAction(ISD::FMAD, MVT::f16, Legal); - for (MVT VT : {MVT::v2i16, MVT::v2f16}) { + for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -488,6 +492,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); + setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand); + setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand); + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand); + if (!Subtarget->hasVOP3PInsts()) { setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); @@ -520,8 +528,31 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); + + setOperationAction(ISD::SHL, MVT::v4i16, Custom); + setOperationAction(ISD::SRA, MVT::v4i16, Custom); + setOperationAction(ISD::SRL, MVT::v4i16, Custom); + setOperationAction(ISD::ADD, MVT::v4i16, Custom); + setOperationAction(ISD::SUB, MVT::v4i16, Custom); + setOperationAction(ISD::MUL, MVT::v4i16, Custom); + + setOperationAction(ISD::SMIN, MVT::v4i16, Custom); + setOperationAction(ISD::SMAX, MVT::v4i16, Custom); + setOperationAction(ISD::UMIN, MVT::v4i16, Custom); + setOperationAction(ISD::UMAX, MVT::v4i16, Custom); + + setOperationAction(ISD::FADD, MVT::v4f16, Custom); + setOperationAction(ISD::FMUL, MVT::v4f16, Custom); + setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom); + setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom); + + setOperationAction(ISD::SELECT, MVT::v4i16, Custom); + setOperationAction(ISD::SELECT, MVT::v4f16, Custom); } + setOperationAction(ISD::FNEG, MVT::v4f16, Custom); + setOperationAction(ISD::FABS, MVT::v4f16, Custom); + if (Subtarget->has16BitInsts()) { setOperationAction(ISD::SELECT, MVT::v2i16, Promote); AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32); @@ -3383,6 +3414,49 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { // Custom DAG Lowering Operations //===----------------------------------------------------------------------===// +// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the +// wider vector type is legal. +SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op, + SelectionDAG &DAG) const { + unsigned Opc = Op.getOpcode(); + EVT VT = Op.getValueType(); + assert(VT == MVT::v4f16); + + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0); + + SDLoc SL(Op); + SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, + Op->getFlags()); + SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, + Op->getFlags()); + + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); +} + +// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the +// wider vector type is legal. 
+SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, + SelectionDAG &DAG) const { + unsigned Opc = Op.getOpcode(); + EVT VT = Op.getValueType(); + assert(VT == MVT::v4i16 || VT == MVT::v4f16); + + SDValue Lo0, Hi0; + std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); + SDValue Lo1, Hi1; + std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1); + + SDLoc SL(Op); + + SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, + Op->getFlags()); + SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, + Op->getFlags()); + + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); +} + SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); @@ -3423,6 +3497,24 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerTRAP(Op, DAG); case ISD::DEBUGTRAP: return lowerDEBUGTRAP(Op, DAG); + case ISD::FABS: + case ISD::FNEG: + return splitUnaryVectorOp(Op, DAG); + case ISD::SHL: + case ISD::SRA: + case ISD::SRL: + case ISD::ADD: + case ISD::SUB: + case ISD::MUL: + case ISD::SMIN: + case ISD::SMAX: + case ISD::UMIN: + case ISD::UMAX: + case ISD::FMINNUM: + case ISD::FMAXNUM: + case ISD::FADD: + case ISD::FMUL: + return splitBinaryVectorOp(Op, DAG); } return SDValue(); } @@ -3630,21 +3722,23 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, bool Unpacked = Subtarget->hasUnpackedD16VMem(); EVT LoadVT = M->getValueType(0); - EVT UnpackedLoadVT = LoadVT.isVector() ? - EVT::getVectorVT(*DAG.getContext(), MVT::i32, - LoadVT.getVectorNumElements()) : LoadVT; EVT EquivLoadVT = LoadVT; - if (LoadVT.isVector()) { - EquivLoadVT = Unpacked ? UnpackedLoadVT : - getEquivalentMemType(*DAG.getContext(), LoadVT); + if (Unpacked && LoadVT.isVector()) { + EquivLoadVT = LoadVT.isVector() ? + EVT::getVectorVT(*DAG.getContext(), MVT::i32, + LoadVT.getVectorNumElements()) : LoadVT; } // Change from v4f16/v2f16 to EquivLoadVT. SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other); - SDValue Load = DAG.getMemIntrinsicNode( - IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, - DL, VTList, Ops, M->getMemoryVT(), M->getMemOperand()); + SDValue Load + = DAG.getMemIntrinsicNode( + IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, + VTList, Ops, M->getMemoryVT(), + M->getMemOperand()); + if (!Unpacked) // Just adjusted the opcode. + return Load; SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked); @@ -3734,8 +3828,10 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, return; } case ISD::FNEG: { + if (N->getValueType(0) != MVT::v2f16) + break; + SDLoc SL(N); - assert(N->getValueType(0) == MVT::v2f16); SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0)); SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, @@ -3745,8 +3841,10 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, return; } case ISD::FABS: { + if (N->getValueType(0) != MVT::v2f16) + break; + SDLoc SL(N); - assert(N->getValueType(0) == MVT::v2f16); SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0)); SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, @@ -4247,6 +4345,23 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, SDLoc SL(Op); EVT VT = Op.getValueType(); + if (VT == MVT::v4i16 || VT == MVT::v4f16) { + EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2); + + // Turn into pair of packed build_vectors. 
+ // TODO: Special case for constants that can be materialized with s_mov_b64. + SDValue Lo = DAG.getBuildVector(HalfVT, SL, + { Op.getOperand(0), Op.getOperand(1) }); + SDValue Hi = DAG.getBuildVector(HalfVT, SL, + { Op.getOperand(2), Op.getOperand(3) }); + + SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo); + SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi); + + SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi }); + return DAG.getNode(ISD::BITCAST, SL, VT, Blend); + } + assert(VT == MVT::v2f16 || VT == MVT::v2i16); SDValue Lo = Op.getOperand(0); @@ -4913,11 +5028,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_image_load: case Intrinsic::amdgcn_image_load_mip: { - EVT LoadVT = Op.getValueType(); - if ((Subtarget->hasUnpackedD16VMem() && LoadVT == MVT::v2f16) || - LoadVT == MVT::v4f16) { - MemSDNode *M = cast<MemSDNode>(Op); - return adjustLoadValueType(getImageOpcode(IntrID), M, DAG); + EVT VT = Op.getValueType(); + if (Subtarget->hasUnpackedD16VMem() && + VT.isVector() && VT.getScalarSizeInBits() == 16) { + return adjustLoadValueType(getImageOpcode(IntrID), cast<MemSDNode>(Op), + DAG); } return SDValue(); @@ -5009,8 +5124,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op)); } - if ((Subtarget->hasUnpackedD16VMem() && Op.getValueType() == MVT::v2f16) || - Op.getValueType() == MVT::v4f16) { + if (Subtarget->hasUnpackedD16VMem() && + Op.getValueType().isVector() && + Op.getValueType().getScalarSizeInBits() == 16) { return adjustLoadValueType(getImageOpcode(IntrID), cast<MemSDNode>(Op), DAG); } @@ -5018,21 +5134,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return SDValue(); } default: - EVT LoadVT = Op.getValueType(); - if (LoadVT.getScalarSizeInBits() != 16) - return SDValue(); - - const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr = - AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrID); - if (D16ImageDimIntr) { - bool Unpacked = Subtarget->hasUnpackedD16VMem(); - MemSDNode *M = cast<MemSDNode>(Op); - - if (isTypeLegal(LoadVT) && (!Unpacked || LoadVT == MVT::f16)) - return SDValue(); - - return adjustLoadValueType(D16ImageDimIntr->D16HelperIntr, - M, DAG, true); + if (Subtarget->hasUnpackedD16VMem() && + Op.getValueType().isVector() && + Op.getValueType().getScalarSizeInBits() == 16) { + if (const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr = + AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrID)) { + return adjustLoadValueType(D16ImageDimIntr->D16HelperIntr, + cast<MemSDNode>(Op), DAG, true); + } } return SDValue(); @@ -5061,13 +5170,8 @@ SDValue SITargetLowering::handleD16VData(SDValue VData, return DAG.UnrollVectorOp(ZExt.getNode()); } - if (isTypeLegal(StoreVT)) - return VData; - - // If target supports packed vmem, we just need to workaround - // the illegal type by casting to an equivalent one. 
- EVT EquivStoreVT = getEquivalentMemType(*DAG.getContext(), StoreVT); - return DAG.getNode(ISD::BITCAST, DL, EquivStoreVT, VData); + assert(isTypeLegal(StoreVT)); + return VData; } SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, @@ -5261,9 +5365,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, case Intrinsic::amdgcn_image_store: case Intrinsic::amdgcn_image_store_mip: { SDValue VData = Op.getOperand(2); - if ((Subtarget->hasUnpackedD16VMem() && - VData.getValueType() == MVT::v2f16) || - VData.getValueType() == MVT::v4f16) { + EVT VT = VData.getValueType(); + if (Subtarget->hasUnpackedD16VMem() && + VT.isVector() && VT.getScalarSizeInBits() == 16) { SDValue Chain = Op.getOperand(0); VData = handleD16VData(VData, DAG); @@ -5293,9 +5397,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, if (D16ImageDimIntr) { SDValue VData = Op.getOperand(2); EVT StoreVT = VData.getValueType(); - if (((StoreVT == MVT::v2f16 || StoreVT == MVT::v4f16) && - Subtarget->hasUnpackedD16VMem()) || - !isTypeLegal(StoreVT)) { + if (Subtarget->hasUnpackedD16VMem() && + StoreVT.isVector() && + StoreVT.getScalarSizeInBits() == 16) { SmallVector<SDValue, 12> Ops(Op.getNode()->op_values()); Ops[1] = DAG.getConstant(D16ImageDimIntr->D16HelperIntr, DL, MVT::i32); @@ -5521,8 +5625,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { } SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType() != MVT::i64) - return SDValue(); + EVT VT = Op.getValueType(); + assert(VT.getSizeInBits() == 64); SDLoc DL(Op); SDValue Cond = Op.getOperand(0); @@ -5544,7 +5648,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi}); - return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); + return DAG.getNode(ISD::BITCAST, DL, VT, Res); } // Catch division cases where we can use shortcuts with rcp and rsq diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index db5a1dc9641..b4546811d3e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -268,7 +268,10 @@ public: EVT VT) const override; MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override; bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; + SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const; + SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const override; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 31c606123e2..1a7e147d1c5 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -795,6 +795,27 @@ foreach Index = 0-15 in { >; } + +def : Pat < + (extract_subvector v4i16:$vec, (i32 0)), + (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub0)) +>; + +def : Pat < + (extract_subvector v4i16:$vec, (i32 2)), + (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub1)) +>; + +def : Pat < + (extract_subvector v4f16:$vec, (i32 0)), + (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub0)) +>; + +def : Pat < + (extract_subvector v4f16:$vec, (i32 2)), + (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1)) +>; + let SubtargetPredicate = isGCN in { // FIXME: Why do only some of these type combinations for SReg and @@ -834,6 
+855,26 @@ def : BitConvert <f64, v2f32, VReg_64>; def : BitConvert <v2f32, f64, VReg_64>; def : BitConvert <f64, v2i32, VReg_64>; def : BitConvert <v2i32, f64, VReg_64>; + +// FIXME: Make SGPR +def : BitConvert <v2i32, v4f16, VReg_64>; +def : BitConvert <v4f16, v2i32, VReg_64>; +def : BitConvert <v2i32, v4f16, VReg_64>; +def : BitConvert <v2i32, v4i16, VReg_64>; +def : BitConvert <v4i16, v2i32, VReg_64>; +def : BitConvert <v2f32, v4f16, VReg_64>; +def : BitConvert <v4f16, v2f32, VReg_64>; +def : BitConvert <v2f32, v4i16, VReg_64>; +def : BitConvert <v4i16, v2f32, VReg_64>; +def : BitConvert <v4i16, f64, VReg_64>; +def : BitConvert <v4f16, f64, VReg_64>; +def : BitConvert <f64, v4i16, VReg_64>; +def : BitConvert <f64, v4f16, VReg_64>; +def : BitConvert <v4i16, i64, VReg_64>; +def : BitConvert <v4f16, i64, VReg_64>; +def : BitConvert <i64, v4i16, VReg_64>; +def : BitConvert <i64, v4f16, VReg_64>; + def : BitConvert <v4i32, v4f32, VReg_128>; def : BitConvert <v4f32, v4i32, VReg_128>; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index e894a3da513..f87a0763b35 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -435,22 +435,22 @@ def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, let AllocationPriority = 7; } -def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> { +def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> { let CopyCost = 1; let AllocationPriority = 8; } -def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add TTMP_64Regs)> { +def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add TTMP_64Regs)> { let isAllocatable = 0; } -def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32, +def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32, (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> { let CopyCost = 1; let AllocationPriority = 8; } -def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32, +def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32, (add SReg_64_XEXEC, EXEC)> { let CopyCost = 1; let AllocationPriority = 8; @@ -505,7 +505,7 @@ def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, } // Register class for all vector registers (VGPRs + Interploation Registers) -def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> { +def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32, (add VGPR_64)> { let Size = 64; // Requires 2 v_mov_b32 to copy |