Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.td | 2
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 27
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 6
-rw-r--r-- | llvm/lib/Target/AMDGPU/BUFInstructions.td | 51
-rw-r--r-- | llvm/lib/Target/AMDGPU/DSInstructions.td | 10
-rw-r--r-- | llvm/lib/Target/AMDGPU/FLATInstructions.td | 6
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 76
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.td | 1
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstructions.td | 105
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 11
-rw-r--r-- | llvm/lib/Target/AMDGPU/SOPInstructions.td | 37
-rw-r--r-- | llvm/lib/Target/AMDGPU/VIInstructions.td | 4
-rw-r--r-- | llvm/lib/Target/AMDGPU/VOP1Instructions.td | 47
-rw-r--r-- | llvm/lib/Target/AMDGPU/VOP2Instructions.td | 72
-rw-r--r-- | llvm/lib/Target/AMDGPU/VOP3Instructions.td | 32
15 files changed, 409 insertions, 78 deletions
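In short, the diff below makes i16 a legal type on AMDGPU subtargets that report has16BitInsts() (VI and newer): i16 is added to the 32-bit register classes, most i16 operations are marked Legal or promoted to i32 in SIISelLowering.cpp, new MUBUF/DS/FLAT patterns cover i16 loads and stores, and the extension/truncation cost hooks in AMDGPUISelLowering.cpp are taught that widening from and narrowing to i16 is free. The block below is a minimal standalone sketch of those freeness rules only, using plain bit-width parameters and illustrative helper names; it is not the LLVM code itself.

```cpp
// Standalone sketch of the zext/trunc "freeness" rules this patch installs
// in AMDGPUISelLowering.cpp (plain C++ with illustrative names, not the
// LLVM code itself).
#include <cstdio>

// Mirrors AMDGPUTargetLowering::isZExtFree after the patch.
bool isZExtFree(unsigned SrcBits, unsigned DstBits, bool Has16BitInsts) {
  // On VI+ (has16BitInsts), widening an i16 to 32 or 64 bits is free:
  // the 16-bit VALU ops already leave zeros in the high half.
  if (SrcBits == 16 && Has16BitInsts)
    return DstBits >= 32;
  // Pre-existing rule: i32 -> i64 is free (just materialize a zero high half).
  return SrcBits == 32 && DstBits == 64;
}

// Mirrors AMDGPUTargetLowering::isTruncateFree after the patch.
bool isTruncateFree(unsigned SrcBits, unsigned DstBits, bool Has16BitInsts) {
  // Truncating to i16 on VI+ is free: it just reads the low half of the
  // 32-bit register.
  if (DstBits == 16 && Has16BitInsts)
    return SrcBits >= 32;
  // Otherwise a truncate is free only onto a 32-bit subregister boundary.
  return DstBits < SrcBits && DstBits % 32 == 0;
}

int main() {
  printf("zext  i16 -> i32 free on VI: %d\n", isZExtFree(16, 32, true));
  printf("trunc i64 -> i16 free on VI: %d\n", isTruncateFree(64, 16, true));
  printf("trunc i32 -> i16 free on SI: %d\n", isTruncateFree(32, 16, false));
  return 0;
}
```

With these hooks, zext/trunc between i16 and 32/64-bit values are treated as no-ops by the cost model, which is what lets the 16-bit VALU instructions (which, per the note in VOP2Instructions.td below, produce a 0 result in the high 16 bits) be used without extra masking.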
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index e7d6ef3fd81..7a208d7c09a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -493,6 +493,8 @@ def isCIVI : Predicate < def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; +def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">; + class PredicateControl { Predicate SubtargetPredicate; Predicate SIAssemblerPredicate = isSICI; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 2e43d427e47..5a871489acd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -587,19 +587,32 @@ bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { // Truncate is just accessing a subregister. - return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0); + + unsigned SrcSize = Source.getSizeInBits(); + unsigned DestSize = Dest.getSizeInBits(); + + return DestSize < SrcSize && DestSize % 32 == 0 ; } bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const { // Truncate is just accessing a subregister. - return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() && - (Dest->getPrimitiveSizeInBits() % 32 == 0); + + unsigned SrcSize = Source->getScalarSizeInBits(); + unsigned DestSize = Dest->getScalarSizeInBits(); + + if (DestSize== 16 && Subtarget->has16BitInsts()) + return SrcSize >= 32; + + return DestSize < SrcSize && DestSize % 32 == 0; } bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const { unsigned SrcSize = Src->getScalarSizeInBits(); unsigned DestSize = Dest->getScalarSizeInBits(); + if (SrcSize == 16 && Subtarget->has16BitInsts()) + return DestSize >= 32; + return SrcSize == 32 && DestSize == 64; } @@ -608,6 +621,10 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const { // practical purposes, the extra mov 0 to load a 64-bit is free. As used, // this will enable reducing 64-bit operations the 32-bit, which is always // good. + + if (Src == MVT::i16) + return Dest == MVT::i32 ||Dest == MVT::i64 ; + return Src == MVT::i32 && Dest == MVT::i64; } @@ -2447,6 +2464,10 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, if (VT.isVector() || Size > 64) return SDValue(); + // There are i16 integer mul/mad. 
+ if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16)) + return SDValue(); + SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index cc9cce5468a..c2544c295e3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -529,14 +529,14 @@ multiclass BFIPatterns <Instruction BFI_INT, def : Pat < (fcopysign f32:$src0, f32:$src1), - (BFI_INT (LoadImm32 0x7fffffff), $src0, $src1) + (BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0, $src1) >; def : Pat < (f64 (fcopysign f64:$src0, f64:$src1)), (REG_SEQUENCE RC64, (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, - (BFI_INT (LoadImm32 0x7fffffff), + (BFI_INT (LoadImm32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)), (i32 (EXTRACT_SUBREG $src1, sub1))), sub1) >; @@ -545,7 +545,7 @@ multiclass BFIPatterns <Instruction BFI_INT, (f64 (fcopysign f64:$src0, f32:$src1)), (REG_SEQUENCE RC64, (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, - (BFI_INT (LoadImm32 0x7fffffff), + (BFI_INT (LoadImm32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)), $src1), sub1) >; diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 42d16a53284..928b5d2d5d3 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -708,13 +708,13 @@ let Predicates = [isGCN] in { // int_SI_vs_load_input def : Pat< (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr), - (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, 0, imm:$attr_offset, 0, 0, 0) + (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, (i32 0), imm:$attr_offset, 0, 0, 0) >; // Offset in an 32-bit VGPR def : Pat < (SIload_constant v4i32:$sbase, i32:$voff), - (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, 0, 0, 0, 0, 0) + (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, (i32 0), 0, 0, 0, 0) >; @@ -914,7 +914,7 @@ def : Pat< >; -class MUBUFLoad_Pattern <MUBUF_Pseudo Instr_ADDR64, ValueType vt, +class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt, PatFrag constant_ld> : Pat < (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), @@ -936,15 +936,34 @@ multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Ins } let Predicates = [isSICI] in { -def : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>; -def : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>; -def : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>; -def : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>; +def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>; +def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>; +def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>; +def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>; defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORD_ADDR64, BUFFER_LOAD_DWORD_OFFSET, i32, mubuf_load_atomic>; defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, BUFFER_LOAD_DWORDX2_OFFSET, i64, mubuf_load_atomic>; } // End Predicates = [isSICI] +multiclass MUBUFLoad_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt, + PatFrag ld> { + + def : Pat < + (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), + (Instr_OFFSET $srsrc, $soffset, 
$offset, $glc, $slc, $tfe) + >; +} + +let Predicates = [Has16BitInsts] in { + +defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_constant>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, az_extloadi8_constant>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, mubuf_sextloadi8>; +defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, mubuf_az_extloadi8>; + +} // End Predicates = [Has16BitInsts] + class MUBUFScratchLoadPat <MUBUF_Pseudo Instr, ValueType vt, PatFrag ld> : Pat < (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset))), @@ -953,6 +972,8 @@ class MUBUFScratchLoadPat <MUBUF_Pseudo Instr, ValueType vt, PatFrag ld> : Pat < def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>; def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i16, sextloadi8_private>; +def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i16, extloadi8_private>; def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>; def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>; def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>; @@ -1025,6 +1046,20 @@ defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORD_ADDR64, BUFFER_STORE_DWORD_ defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWORDX2_OFFSET, i64, global_store_atomic>; } // End Predicates = [isSICI] + +multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt, + PatFrag st> { + + def : Pat < + (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe)), + (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe) + >; +} + +defm : MUBUFStore_Pattern <BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_global>; +defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT_OFFSET, i16, global_store>; + class MUBUFScratchStorePat <MUBUF_Pseudo Instr, ValueType vt, PatFrag st> : Pat < (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset)), @@ -1033,6 +1068,8 @@ class MUBUFScratchStorePat <MUBUF_Pseudo Instr, ValueType vt, PatFrag st> : Pat def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i32, truncstorei8_private>; def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i32, truncstorei16_private>; +def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i16, truncstorei8_private>; +def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i16, store_private>; def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>; def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>; def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>; diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 54935bbde7f..a077001df6b 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -489,8 +489,12 @@ class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat < def : DSReadPat <DS_READ_I8, i32, si_sextload_local_i8>; def : DSReadPat <DS_READ_U8, i32, si_az_extload_local_i8>; +def : DSReadPat <DS_READ_I8, i16, si_sextload_local_i8>; +def : DSReadPat <DS_READ_U8, i16, si_az_extload_local_i8>; +def : DSReadPat <DS_READ_I16, i32, si_sextload_local_i16>; def : DSReadPat <DS_READ_I16, i32, si_sextload_local_i16>; def : DSReadPat <DS_READ_U16, i32, si_az_extload_local_i16>; +def : DSReadPat <DS_READ_U16, i16, 
si_load_local>; def : DSReadPat <DS_READ_B32, i32, si_load_local>; let AddedComplexity = 100 in { @@ -512,6 +516,8 @@ class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat < def : DSWritePat <DS_WRITE_B8, i32, si_truncstore_local_i8>; def : DSWritePat <DS_WRITE_B16, i32, si_truncstore_local_i16>; +def : DSWritePat <DS_WRITE_B8, i16, si_truncstore_local_i8>; +def : DSWritePat <DS_WRITE_B16, i16, si_store_local>; def : DSWritePat <DS_WRITE_B32, i32, si_store_local>; let AddedComplexity = 100 in { @@ -522,8 +528,8 @@ def : DSWritePat <DS_WRITE_B64, v2i32, si_store_local_align8>; def : Pat < (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)), - (DS_WRITE2_B32 $ptr, (EXTRACT_SUBREG $value, sub0), - (EXTRACT_SUBREG $value, sub1), $offset0, $offset1, + (DS_WRITE2_B32 $ptr, (i32 (EXTRACT_SUBREG $value, sub0)), + (i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1, (i1 0)) >; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 7b54c61dc21..4a86b1e0b54 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -341,6 +341,8 @@ let Predicates = [isCIVI] in { def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i32>; def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i32>; +def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i16>; +def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i16>; def : FlatLoadPat <FLAT_LOAD_USHORT, flat_az_extloadi16, i32>; def : FlatLoadPat <FLAT_LOAD_SSHORT, flat_sextloadi16, i32>; def : FlatLoadPat <FLAT_LOAD_DWORD, flat_load, i32>; @@ -389,6 +391,10 @@ def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>; } // End Predicates = [isCIVI] +let Predicates = [isVI] in { + def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i16>; + def : FlatStorePat <FLAT_STORE_SHORT, flat_store, i16>; +} //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 3b84e386341..ac13bd2b07e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -78,6 +78,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); + if (Subtarget->has16BitInsts()) + addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); + computeRegisterProperties(STI.getRegisterInfo()); // We need to custom lower vector stores from local memory @@ -221,6 +224,55 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FDIV, MVT::f32, Custom); setOperationAction(ISD::FDIV, MVT::f64, Custom); + if (Subtarget->has16BitInsts()) { + setOperationAction(ISD::Constant, MVT::i16, Legal); + + setOperationAction(ISD::SMIN, MVT::i16, Legal); + setOperationAction(ISD::SMAX, MVT::i16, Legal); + + setOperationAction(ISD::UMIN, MVT::i16, Legal); + setOperationAction(ISD::UMAX, MVT::i16, Legal); + + setOperationAction(ISD::SETCC, MVT::i16, Promote); + AddPromotedToType(ISD::SETCC, MVT::i16, MVT::i32); + + setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote); + AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32); + + setOperationAction(ISD::ROTR, MVT::i16, Promote); + setOperationAction(ISD::ROTL, MVT::i16, Promote); + + setOperationAction(ISD::SDIV, MVT::i16, Promote); + setOperationAction(ISD::UDIV, MVT::i16, Promote); + 
setOperationAction(ISD::SREM, MVT::i16, Promote); + setOperationAction(ISD::UREM, MVT::i16, Promote); + + setOperationAction(ISD::BSWAP, MVT::i16, Promote); + setOperationAction(ISD::BITREVERSE, MVT::i16, Promote); + + setOperationAction(ISD::CTTZ, MVT::i16, Promote); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote); + setOperationAction(ISD::CTLZ, MVT::i16, Promote); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote); + + setOperationAction(ISD::SELECT_CC, MVT::i16, Expand); + + setOperationAction(ISD::BR_CC, MVT::i16, Expand); + + setOperationAction(ISD::LOAD, MVT::i16, Custom); + + setTruncStoreAction(MVT::i64, MVT::i16, Expand); + + setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); + AddPromotedToType(ISD::UINT_TO_FP, MVT::i16, MVT::i32); + setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote); + AddPromotedToType(ISD::SINT_TO_FP, MVT::i16, MVT::i32); + setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote); + AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32); + setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote); + AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32); + } + setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FMINNUM); @@ -2558,7 +2610,6 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { EVT MemVT = Load->getMemoryVT(); if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) { - assert(MemVT == MVT::i1 && "Only i1 non-extloads expected"); // FIXME: Copied from PPC // First, load into 32 bits, then truncate to 1 bit. @@ -2566,8 +2617,10 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue BasePtr = Load->getBasePtr(); MachineMemOperand *MMO = Load->getMemOperand(); + EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16; + SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, - BasePtr, MVT::i8, MMO); + BasePtr, RealMemVT, MMO); SDValue Ops[] = { DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD), @@ -3381,8 +3434,23 @@ static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, } EVT VT = K0->getValueType(0); - return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT, - Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); + + MVT NVT = MVT::i32; + unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + + SDValue Tmp1, Tmp2, Tmp3; + Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0)); + Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1)); + Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1); + + if (VT == MVT::i16) { + Tmp1 = DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, NVT, + Tmp1, Tmp2, Tmp3); + + return DAG.getNode(ISD::TRUNCATE, SL, VT, Tmp1); + } else + return DAG.getNode(Signed ? 
AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT, + Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); } static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index f19e99e7cd1..d770bd425c4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1128,7 +1128,6 @@ def getAtomicNoRetOp : InstrMapping { include "SIInstructions.td" include "CIInstructions.td" -include "VIInstructions.td" include "DSInstructions.td" include "MIMGInstructions.td" diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 4122eb915f3..b758a576047 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -374,7 +374,7 @@ def : Pat< def : Pat < (int_AMDGPU_kilp), - (SI_KILL 0xbf800000) + (SI_KILL (i32 0xbf800000)) >; def : Pat < @@ -555,7 +555,7 @@ def : BitConvert <v16f32, v16i32, VReg_512>; def : Pat < (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod), (f32 FP_ZERO), (f32 FP_ONE)), - (V_ADD_F32_e64 $src0_modifiers, $src0, 0, 0, 1, $omod) + (V_ADD_F32_e64 $src0_modifiers, $src0, 0, (i32 0), 1, $omod) >; /********** ================================ **********/ @@ -566,7 +566,7 @@ def : Pat < def : Pat < (fneg (fabs f32:$src)), - (S_OR_B32 $src, (S_MOV_B32 0x80000000)) // Set sign bit + (S_OR_B32 $src, (S_MOV_B32(i32 0x80000000))) // Set sign bit >; // FIXME: Should use S_OR_B32 @@ -575,19 +575,19 @@ def : Pat < (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG f64:$src, sub0)), sub0, - (V_OR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), - (V_MOV_B32_e32 0x80000000)), // Set sign bit. + (V_OR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)), + (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit. sub1) >; def : Pat < (fabs f32:$src), - (V_AND_B32_e64 $src, (V_MOV_B32_e32 0x7fffffff)) + (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x7fffffff))) >; def : Pat < (fneg f32:$src), - (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) + (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x80000000))) >; def : Pat < @@ -595,8 +595,8 @@ def : Pat < (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG f64:$src, sub0)), sub0, - (V_AND_B32_e64 (EXTRACT_SUBREG f64:$src, sub1), - (V_MOV_B32_e32 0x7fffffff)), // Set sign bit. + (V_AND_B32_e64 (i32 (EXTRACT_SUBREG f64:$src, sub1)), + (V_MOV_B32_e32 (i32 0x7fffffff))), // Set sign bit. 
sub1) >; @@ -605,8 +605,8 @@ def : Pat < (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG f64:$src, sub0)), sub0, - (V_XOR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1), - (V_MOV_B32_e32 0x80000000)), + (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)), + (i32 (V_MOV_B32_e32 (i32 0x80000000)))), sub1) >; @@ -666,21 +666,21 @@ def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>; def : Pat < (int_AMDGPU_cube v4f32:$src), (REG_SEQUENCE VReg_128, - (V_CUBETC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), - 0 /* src1_modifiers */, (EXTRACT_SUBREG $src, sub1), - 0 /* src2_modifiers */, (EXTRACT_SUBREG $src, sub2), + (V_CUBETC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)), + 0 /* src1_modifiers */, (f32 (EXTRACT_SUBREG $src, sub1)), + 0 /* src2_modifiers */, (f32 (EXTRACT_SUBREG $src, sub2)), 0 /* clamp */, 0 /* omod */), sub0, - (V_CUBESC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), - 0 /* src2_modifiers */,(EXTRACT_SUBREG $src, sub2), + (V_CUBESC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)), + 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)), + 0 /* src2_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)), 0 /* clamp */, 0 /* omod */), sub1, - (V_CUBEMA_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), + (V_CUBEMA_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)), + 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)), + 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)), 0 /* clamp */, 0 /* omod */), sub2, - (V_CUBEID_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1), - 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2), + (V_CUBEID_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)), + 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)), + 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)), 0 /* clamp */, 0 /* omod */), sub3) >; @@ -701,7 +701,7 @@ def : Ext32Pat <anyext>; def : Pat < (AMDGPUurecip i32:$src0), (V_CVT_U32_F32_e32 - (V_MUL_F32_e32 CONST.FP_UINT_MAX_PLUS_1, + (V_MUL_F32_e32 (i32 CONST.FP_UINT_MAX_PLUS_1), (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0)))) >; @@ -767,32 +767,37 @@ def : Pat < //===----------------------------------------------------------------------===// def : Pat<(i32 (sext_inreg i32:$src, i1)), - (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16 + (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16 // Handle sext_inreg in i64 def : Pat < (i64 (sext_inreg i64:$src, i1)), - (S_BFE_I64 i64:$src, 0x10000) // 0 | 1 << 16 + (S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16 +>; + +def : Pat < + (i16 (sext_inreg i16:$src, i8)), + (S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16 >; def : Pat < (i64 (sext_inreg i64:$src, i8)), - (S_BFE_I64 i64:$src, 0x80000) // 0 | 8 << 16 + (S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16 >; def : Pat < (i64 (sext_inreg i64:$src, i16)), - (S_BFE_I64 i64:$src, 0x100000) // 0 | 16 << 16 + (S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16 >; def : Pat < (i64 (sext_inreg i64:$src, i32)), - (S_BFE_I64 i64:$src, 0x200000) // 0 | 32 << 16 + (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16 >; def : Pat < (i64 (zext i32:$src)), - (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 0), sub1) + (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1) >; def : Pat < @@ -804,7 +809,7 @@ class ZExt_i64_i1_Pat <SDNode 
ext> : Pat < (i64 (ext i1:$src)), (REG_SEQUENCE VReg_64, (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0, - (S_MOV_B32 0), sub1) + (S_MOV_B32 (i32 0)), sub1) >; @@ -816,25 +821,25 @@ def : ZExt_i64_i1_Pat<anyext>; def : Pat < (i64 (sext i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, - (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, 31), SReg_32_XM0)), sub1) + (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1) >; def : Pat < (i64 (sext i1:$src)), (REG_SEQUENCE VReg_64, - (V_CNDMASK_B32_e64 0, -1, $src), sub0, - (V_CNDMASK_B32_e64 0, -1, $src), sub1) + (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub0, + (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub1) >; -class FPToI1Pat<Instruction Inst, int KOne, ValueType vt, SDPatternOperator fp_to_int> : Pat < +class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : Pat < (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))), - (i1 (Inst 0, KOne, $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)) + (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)) >; -def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, f32, fp_to_uint>; -def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, f32, fp_to_sint>; -def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, f64, fp_to_uint>; -def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, f64, fp_to_sint>; +def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>; +def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>; +def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>; +def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>; // If we need to perform a logical operation on i1 values, we need to // use vector comparisons since there is only one SCC register. 
Vector @@ -859,12 +864,12 @@ def : Pat < def : Pat < (f32 (sint_to_fp i1:$src)), - (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src) + (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src) >; def : Pat < (f32 (uint_to_fp i1:$src)), - (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_ONE, $src) + (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src) >; def : Pat < @@ -888,20 +893,20 @@ def : Pat < def : Pat < (i1 (trunc i32:$a)), - (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), 1) + (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1)) >; def : Pat < (i1 (trunc i64:$a)), (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), - (EXTRACT_SUBREG $a, sub0)), 1) + (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) >; def : Pat < (i32 (bswap i32:$a)), - (V_BFI_B32 (S_MOV_B32 0x00ff00ff), - (V_ALIGNBIT_B32 $a, $a, 24), - (V_ALIGNBIT_B32 $a, $a, 8)) + (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)), + (V_ALIGNBIT_B32 $a, $a, (i32 24)), + (V_ALIGNBIT_B32 $a, $a, (i32 8))) >; def : Pat < @@ -917,7 +922,7 @@ multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> { def : Pat < (vt (add (vt (shl 1, vt:$a)), -1)), - (BFM $a, (MOV 0)) + (BFM $a, (MOV (i32 0))) >; } @@ -928,7 +933,7 @@ def : BFEPattern <V_BFE_U32, S_MOV_B32>; def : Pat< (fcanonicalize f32:$src), - (V_MUL_F32_e64 0, CONST.FP32_ONE, 0, $src, 0, 0) + (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), 0, $src, 0, 0) >; def : Pat< @@ -963,7 +968,7 @@ def : Pat < (V_MOV_B64_PSEUDO 0x3fefffffffffffff), DSTCLAMP.NONE, DSTOMOD.NONE), $x, - (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)), + (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))), DSTCLAMP.NONE, DSTOMOD.NONE) >; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 7d3634ef2d1..a5ba0ef7e0e 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -123,7 +123,7 @@ def SCC_CLASS : RegisterClass<"AMDGPU", [i1], 1, (add SCC)> { // TODO: Do we need to set DwarfRegAlias on register tuples? 
// SGPR 32-bit registers -def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, +def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, (add (sequence "SGPR%u", 0, 103))> { let AllocationPriority = 1; } @@ -190,7 +190,8 @@ def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3], (add (decimate (shl TTMP_32, 3), 4))]>; // VGPR 32-bit registers -def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, +// i16 only on VI+ +def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, (add (sequence "VGPR%u", 0, 255))> { let AllocationPriority = 1; let Size = 32; @@ -258,8 +259,8 @@ def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32], 32, } // Register class for all scalar registers (SGPRs + Special Registers) -def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32, - (add SReg_32_XM0, M0)> { +def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, + (add SReg_32_XM0, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)> { let AllocationPriority = 1; } @@ -346,7 +347,7 @@ def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { let Size = 32; } -def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)> { +def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, (add VGPR_32, SReg_32)> { let isAllocatable = 0; } diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index e38a11db9ac..2486fbf3edf 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -879,7 +879,7 @@ def : Pat < (i64 (ctpop i64:$src)), (i64 (REG_SEQUENCE SReg_64, (i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0, - (S_MOV_B32 0), sub1)) + (S_MOV_B32 (i32 0)), sub1)) >; def : Pat < @@ -887,6 +887,18 @@ def : Pat < (S_ABS_I32 $x) >; +def : Pat < + (i16 imm:$imm), + (S_MOV_B32 imm:$imm) +>; + +// Same as a 32-bit inreg +def : Pat< + (i32 (sext i16:$src)), + (S_SEXT_I32_I16 $src) +>; + + //===----------------------------------------------------------------------===// // SOP2 Patterns //===----------------------------------------------------------------------===// @@ -898,6 +910,29 @@ def : Pat < (S_ADD_U32 $src0, $src1) >; +// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that +// REG_SEQUENCE patterns don't support instructions with multiple +// outputs. +def : Pat< + (i64 (zext i16:$src)), + (REG_SEQUENCE SReg_64, + (i32 (COPY_TO_REGCLASS (S_AND_B32 $src, (S_MOV_B32 (i32 0xffff))), SGPR_32)), sub0, + (S_MOV_B32 (i32 0)), sub1) +>; + +def : Pat < + (i64 (sext i16:$src)), + (REG_SEQUENCE SReg_64, (i32 (S_SEXT_I32_I16 $src)), sub0, + (i32 (COPY_TO_REGCLASS (S_ASHR_I32 (i32 (S_SEXT_I32_I16 $src)), (S_MOV_B32 (i32 31))), SGPR_32)), sub1) +>; + +def : Pat< + (i32 (zext i16:$src)), + (S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src) +>; + + + //===----------------------------------------------------------------------===// // SOPP Patterns //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VIInstructions.td b/llvm/lib/Target/AMDGPU/VIInstructions.td index ead90ece0ad..b45c8fc9c7d 100644 --- a/llvm/lib/Target/AMDGPU/VIInstructions.td +++ b/llvm/lib/Target/AMDGPU/VIInstructions.td @@ -8,3 +8,7 @@ //===----------------------------------------------------------------------===// // Instruction definitions for VI and newer. //===----------------------------------------------------------------------===// + +FIXME: Deleting this file broke buildbots that don't do full rebuilds. 
This +file is no longer used by the backend, so it can be deleted once all +the buildbots update there dependencies. diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 6124d4e05da..b2840982462 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -301,6 +301,20 @@ defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16>; } +let Predicates = [isVI] in { + +def : Pat< + (f32 (f16_to_fp i16:$src)), + (V_CVT_F32_F16_e32 $src) +>; + +def : Pat< + (i16 (fp_to_f16 f32:$src)), + (V_CVT_F16_F32_e32 $src) +>; + +} + //===----------------------------------------------------------------------===// // Target //===----------------------------------------------------------------------===// @@ -561,10 +575,39 @@ def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo<VReg_512>; let Predicates = [isVI] in { def : Pat < - (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask, - imm:$bound_ctrl), + (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask, + imm:$bound_ctrl)), (V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask), (as_i32imm $bank_mask), (as_i1imm $bound_ctrl)) >; + +def : Pat< + (i32 (anyext i16:$src)), + (COPY $src) +>; + +def : Pat< + (i64 (anyext i16:$src)), + (REG_SEQUENCE VReg_64, + (i32 (COPY $src)), sub0, + (V_MOV_B32_e32 (i32 0)), sub1) +>; + +def : Pat< + (i16 (trunc i32:$src)), + (COPY $src) +>; + +def : Pat< + (i1 (trunc i16:$src)), + (COPY $src) +>; + + +def : Pat < + (i16 (trunc i64:$src)), + (EXTRACT_SUBREG $src, sub0) +>; + } // End Predicates = [isVI] diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index fc13382926d..570ca05587b 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -345,6 +345,78 @@ defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16>; } // End SubtargetPredicate = isVI +// Note: 16-bit instructions produce a 0 result in the high 16-bits. 
+multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> { + +def : Pat< + (op i16:$src0, i16:$src1), + (inst $src0, $src1) +>; + +def : Pat< + (i32 (zext (op i16:$src0, i16:$src1))), + (inst $src0, $src1) +>; + +def : Pat< + (i64 (zext (op i16:$src0, i16:$src1))), + (REG_SEQUENCE VReg_64, + (inst $src0, $src1), sub0, + (V_MOV_B32_e32 (i32 0)), sub1) +>; + +} + +multiclass Bits_OpsRev_i16_Pats <SDPatternOperator op, Instruction inst> { + +def : Pat< + (op i16:$src0, i32:$src1), + (inst $src1, $src0) +>; + +def : Pat< + (i32 (zext (op i16:$src0, i32:$src1))), + (inst $src1, $src0) +>; + + +def : Pat< + (i64 (zext (op i16:$src0, i32:$src1))), + (REG_SEQUENCE VReg_64, + (inst $src1, $src0), sub0, + (V_MOV_B32_e32 (i32 0)), sub1) +>; +} + +class ZExt_i16_i1_Pat <SDNode ext> : Pat < + (i16 (ext i1:$src)), + (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src) +>; + +let Predicates = [isVI] in { + +defm : Arithmetic_i16_Pats<add, V_ADD_U16_e32>; +defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e32>; +defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e32>; +defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e32>; +defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e32>; +defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e32>; +defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e32>; + +defm : Arithmetic_i16_Pats<and, V_AND_B32_e32>; +defm : Arithmetic_i16_Pats<or, V_OR_B32_e32>; +defm : Arithmetic_i16_Pats<xor, V_XOR_B32_e32>; + +defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e32>; +defm : Bits_OpsRev_i16_Pats<srl, V_LSHRREV_B16_e32>; +defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_B16_e32>; + +def : ZExt_i16_i1_Pat<zext>; +def : ZExt_i16_i1_Pat<sext>; +def : ZExt_i16_i1_Pat<anyext>; + +} // End Predicates = [isVI] + //===----------------------------------------------------------------------===// // SI //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 0f063756de5..73e331503ad 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -222,6 +222,38 @@ let isCommutable = 1 in { } // End SubtargetPredicate = isVI +def : Pat < + (i16 (select i1:$src0, i16:$src1, i16:$src2)), + (V_CNDMASK_B32_e64 $src2, $src1, $src0) +>; + +let Predicates = [isVI] in { + +multiclass Tenary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2, + Instruction inst, SDPatternOperator op3> { +def : Pat< + (op2 (op1 i16:$src0, i16:$src1), i16:$src2), + (inst i16:$src0, i16:$src1, i16:$src2) +>; + +def : Pat< + (i32 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))), + (inst i16:$src0, i16:$src1, i16:$src2) +>; + +def : Pat< + (i64 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))), + (REG_SEQUENCE VReg_64, + (inst i16:$src0, i16:$src1, i16:$src2), sub0, + (V_MOV_B32_e32 (i32 0)), sub1) +>; +} + +defm: Tenary_i16_Pats<mul, add, V_MAD_U16, zext>; +defm: Tenary_i16_Pats<mul, add, V_MAD_I16, sext>; + +} // End Predicates = [isVI] + //===----------------------------------------------------------------------===// // Target |
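One of the less obvious pieces above is the change to performIntMed3ImmCombine in SIISelLowering.cpp: for i16, the min/max clamp operands are sign- or zero-extended to i32, a single 32-bit med3 node is emitted, and the result is truncated back to i16. The sketch below models that rewrite with plain integers; the helper names are illustrative and this is not the SelectionDAG code.

```cpp
// Standalone model of the i16 clamp -> med3 rewrite added to
// performIntMed3ImmCombine (plain C++, not the SelectionDAG code).
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Median of three 32-bit values -- what a V_MED3_I32 computes.
int32_t med3_i32(int32_t a, int32_t b, int32_t c) {
  return std::max(std::min(a, b), std::min(std::max(a, b), c));
}

// i16 clamp done the way the combine emits it: sign-extend the operands
// to 32 bits, take the median there, then truncate back to 16 bits.
int16_t clamp_i16_via_med3(int16_t x, int16_t k0, int16_t k1) {
  return static_cast<int16_t>(med3_i32(x, k0, k1));
}

int main() {
  // max(min(x, 100), -5) == med3(x, -5, 100); both clamp x into [-5, 100].
  printf("%d\n", clamp_i16_via_med3(1234, -5, 100));  // prints 100
  printf("%d\n", clamp_i16_via_med3(-77, -5, 100));   // prints -5
  return 0;
}
```

The identity max(min(x, K1), K0) == med3(x, K0, K1) holds whenever K0 <= K1, so computing the median in 32 bits and truncating yields the same i16 clamp result.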