Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp |  18
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp   |  69
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp   |  11
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUInstructions.td    |  10
-rw-r--r--   llvm/lib/Target/AMDGPU/SIFoldOperands.cpp       |  14
-rw-r--r--   llvm/lib/Target/AMDGPU/SIISelLowering.cpp       |  75
-rw-r--r--   llvm/lib/Target/AMDGPU/SIInstrInfo.cpp          | 106
-rw-r--r--   llvm/lib/Target/AMDGPU/SIInstrInfo.h            |  25
-rw-r--r--   llvm/lib/Target/AMDGPU/SIInstrInfo.td           |  20
-rw-r--r--   llvm/lib/Target/AMDGPU/SIInstructions.td        |  63
-rw-r--r--   llvm/lib/Target/AMDGPU/VOP3PInstructions.td     |  30
11 files changed, 378 insertions, 63 deletions
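
The functional core of this patch: v2i16 and v2f16 become legal packed types on subtargets with VOP3P instructions, and a BUILD_VECTOR of two 16-bit constants is folded into a single S_MOV_B32 of a packed 32-bit immediate. A minimal standalone sketch of that packing, assuming only the shift-and-or visible in AMDGPUDAGToDAGISel::Select below; the helper name packV2x16 is hypothetical and not part of the patch:

// Sketch: packing two 16-bit elements the way the new BUILD_VECTOR
// folding builds its S_MOV_B32 immediate (K = LHSVal | (RHSVal << 16)).
#include <cassert>
#include <cstdint>

static uint32_t packV2x16(uint16_t Elt0, uint16_t Elt1) {
  // Element 0 occupies bits [15:0], element 1 bits [31:16].
  return static_cast<uint32_t>(Elt0) | (static_cast<uint32_t>(Elt1) << 16);
}

int main() {
  // 0x3C00 is half-precision 1.0, so <half 1.0, half 1.0> packs to
  // 0x3C003C00, matching the V2FP16_ONE constant added below.
  assert(packV2x16(0x3C00, 0x3C00) == 0x3C003C00u);
}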
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 23f124b637f..0652dacd9b0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -181,12 +181,20 @@ bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
 }
 
 bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
-  if (T->isIntegerTy() && T->getIntegerBitWidth() > 1 &&
-      T->getIntegerBitWidth() <= 16)
+  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
+  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
     return true;
-  if (!T->isVectorTy())
-    return false;
-  return needsPromotionToI32(cast<VectorType>(T)->getElementType());
+
+  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
+    // TODO: The set of packed operations is more limited, so may want to
+    // promote some anyway.
+    if (ST->hasVOP3PInsts())
+      return false;
+
+    return needsPromotionToI32(VT->getElementType());
+  }
+
+  return false;
 }
 
 // Return true if the op promoted to i32 should have nsw set.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index e02ced04f08..fddf94339a1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -159,6 +159,10 @@ private:
                           SDValue &Clamp, SDValue &Omod) const;
 
+  bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+  bool SelectVOP3PMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
+                        SDValue &Clamp) const;
+
   void SelectADD_SUB_I64(SDNode *N);
   void SelectUADDO_USUBO(SDNode *N);
   void SelectDIV_SCALE(SDNode *N);
@@ -305,6 +309,20 @@ static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
   llvm_unreachable("invalid vector size");
 }
 
+static bool getConstantValue(SDValue N, uint32_t &Out) {
+  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
+    Out = C->getAPIntValue().getZExtValue();
+    return true;
+  }
+
+  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
+    Out = C->getValueAPF().bitcastToAPInt().getZExtValue();
+    return true;
+  }
+
+  return false;
+}
+
 void AMDGPUDAGToDAGISel::Select(SDNode *N) {
   unsigned int Opc = N->getOpcode();
   if (N->isMachineOpcode()) {
@@ -356,7 +374,24 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
     EVT VT = N->getValueType(0);
     unsigned NumVectorElts = VT.getVectorNumElements();
     EVT EltVT = VT.getVectorElementType();
+
+    if (VT == MVT::v2i16 || VT == MVT::v2f16) {
+      if (Opc == ISD::BUILD_VECTOR) {
+        uint32_t LHSVal, RHSVal;
+        if (getConstantValue(N->getOperand(0), LHSVal) &&
+            getConstantValue(N->getOperand(1), RHSVal)) {
+          uint32_t K = LHSVal | (RHSVal << 16);
+          CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, VT,
+                               CurDAG->getTargetConstant(K, SDLoc(N), MVT::i32));
+          return;
+        }
+      }
+
+      break;
+    }
+
     assert(EltVT.bitsEq(MVT::i32));
+
     if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
       RegClassID = selectSGPRVectorRegClassID(NumVectorElts);
     } else {
@@ -1565,7 +1600,6 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
 bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
   unsigned Mods = 0;
-
   Src = In;
 
   if (Src.getOpcode() == ISD::FNEG) {
@@ -1579,7 +1613,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
   }
 
   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
-
   return true;
 }
 
@@ -1633,6 +1666,38 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
   return SelectVOP3Mods(In, Src, SrcMods);
 }
 
+bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
+                                         SDValue &SrcMods) const {
+  unsigned Mods = 0;
+  Src = In;
+
+  // FIXME: Look for fneg on separate components
+  if (Src.getOpcode() == ISD::FNEG) {
+    Mods |= (SISrcMods::NEG | SISrcMods::NEG_HI);
+    Src = Src.getOperand(0);
+  }
+
+  // Packed instructions do not have abs modifiers.
+
+  // FIXME: Handle abs/neg of individual components.
+  // FIXME: Handle swizzling with op_sel
+  Mods |= SISrcMods::OP_SEL_1;
+
+  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src,
+                                          SDValue &SrcMods,
+                                          SDValue &Clamp) const {
+  SDLoc SL(In);
+
+  // FIXME: Handle clamp and op_sel
+  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);
+
+  return SelectVOP3PMods(In, Src, SrcMods);
+}
+
 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
   const AMDGPUTargetLowering& Lowering =
       *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index f28afa89bd2..edaab0063da 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -644,12 +644,17 @@ bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
 
 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
   assert(VT.isFloatingPoint());
-  return VT == MVT::f32 || VT == MVT::f64 || (Subtarget->has16BitInsts() &&
-                                              VT == MVT::f16);
+
+  // Packed operations do not have a fabs modifier.
+  return VT == MVT::f32 || VT == MVT::f64 ||
+         (Subtarget->has16BitInsts() && VT == MVT::f16);
 }
 
 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
-  return isFAbsFree(VT);
+  assert(VT.isFloatingPoint());
+  return VT == MVT::f32 || VT == MVT::f64 ||
+         (Subtarget->has16BitInsts() && VT == MVT::f16) ||
+         (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
 }
 
 bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(EVT MemVT,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index d0c62877524..ba2aed68fb8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -75,6 +75,12 @@ def brtarget : Operand<OtherVT>;
 // Misc. PatFrags
 //===----------------------------------------------------------------------===//
 
+class HasOneUseUnaryOp<SDPatternOperator op> : PatFrag<
+  (ops node:$src0),
+  (op $src0),
+  [{ return N->hasOneUse(); }]
+>;
+
 class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
   (ops node:$src0, node:$src1),
   (op $src0, $src1),
@@ -87,6 +93,7 @@ class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag<
   [{ return N->hasOneUse(); }]
 >;
 
+def trunc_oneuse : HasOneUseUnaryOp<trunc>;
 
 let Properties = [SDNPCommutative, SDNPAssociative] in {
 def smax_oneuse : HasOneUseBinOp<smax>;
@@ -101,6 +108,8 @@ def xor_oneuse : HasOneUseBinOp<xor>;
 } // Properties = [SDNPCommutative, SDNPAssociative]
 
 def sub_oneuse : HasOneUseBinOp<sub>;
+
+def srl_oneuse : HasOneUseBinOp<srl>;
 def shl_oneuse : HasOneUseBinOp<shl>;
 
 def select_oneuse : HasOneUseTernaryOp<select>;
@@ -440,6 +449,7 @@ int PI = 0x40490fdb;
 int TWO_PI_INV = 0x3e22f983;
 int FP_UINT_MAX_PLUS_1 = 0x4f800000;    // 1 << 32 in floating point encoding
 int FP16_ONE = 0x3C00;
+int V2FP16_ONE = 0x3C003C00;
 int FP32_ONE = 0x3f800000;
 int FP32_NEG_ONE = 0xbf800000;
 int FP64_ONE = 0x3ff0000000000000;
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 73cd96b1180..0a0584d5074 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -367,8 +367,6 @@ void SIFoldOperands::foldOperand(
   const TargetRegisterClass *FoldRC =
     TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
 
-  APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType),
-            OpToFold.getImm());
 
   // Split 64-bit constants into 32-bits for folding.
   if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
@@ -378,21 +376,25 @@ void SIFoldOperands::foldOperand(
       MRI->getRegClass(UseReg) :
       TRI->getPhysRegClass(UseReg);
 
-    assert(Imm.getBitWidth() == 64);
-
     if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
       return;
 
+    APInt Imm(64, OpToFold.getImm());
     if (UseOp.getSubReg() == AMDGPU::sub0) {
       Imm = Imm.getLoBits(32);
     } else {
       assert(UseOp.getSubReg() == AMDGPU::sub1);
       Imm = Imm.getHiBits(32);
     }
+
+    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
+    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
+    return;
   }
 
-  MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
-  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
+
+
+  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
 }
 
 static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 109870781f5..0959707ac9a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -402,6 +402,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     }
   }
 
+  // XXX - Do these do anything? Vector constants turn into build_vector.
+  setOperationAction(ISD::Constant, MVT::v2i16, Legal);
+  setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
+
   setOperationAction(ISD::STORE, MVT::v2i16, Promote);
   AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
   setOperationAction(ISD::STORE, MVT::v2f16, Promote);
@@ -411,6 +415,46 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
   setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
+
+  setOperationAction(ISD::AND, MVT::v2i16, Promote);
+  AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
+  setOperationAction(ISD::OR, MVT::v2i16, Promote);
+  AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
+  setOperationAction(ISD::XOR, MVT::v2i16, Promote);
+  AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
+  setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
+  AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
+  setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
+  AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
+
+  setOperationAction(ISD::ADD, MVT::v2i16, Legal);
+  setOperationAction(ISD::SUB, MVT::v2i16, Legal);
+  setOperationAction(ISD::MUL, MVT::v2i16, Legal);
+  setOperationAction(ISD::SHL, MVT::v2i16, Legal);
+  setOperationAction(ISD::SRL, MVT::v2i16, Legal);
+  setOperationAction(ISD::SRA, MVT::v2i16, Legal);
+  setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
+  setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
+  setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
+  setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
+
+  setOperationAction(ISD::FADD, MVT::v2f16, Legal);
+  setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
+  setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
+  setOperationAction(ISD::FMA, MVT::v2f16, Legal);
+  setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
+  setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
+
+  // This isn't really legal, but this avoids the legalizer unrolling it (and
+  // allows matching fneg (fabs x) patterns).
+  setOperationAction(ISD::FABS, MVT::v2f16, Legal);
+
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+
+  setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
+  setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
+  setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
 }
 
 setTargetDAGCombine(ISD::FADD);
@@ -428,6 +472,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::SINT_TO_FP);
   setTargetDAGCombine(ISD::UINT_TO_FP);
   setTargetDAGCombine(ISD::FCANONICALIZE);
+  setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
 
   // All memory operations. Some folding on the pointer operand is done to help
   // matching the constant offsets in the addressing modes.
@@ -3965,7 +4010,7 @@ SDValue SITargetLowering::performClassCombine(SDNode *N,
 
 SDValue SITargetLowering::performFCanonicalizeCombine(
   SDNode *N,
   DAGCombinerInfo &DCI) const {
-  ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
+  ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0));
   if (!CFP)
     return SDValue();
 
@@ -3975,13 +4020,14 @@
   // Flush denormals to 0 if not enabled.
   if (C.isDenormal()) {
     EVT VT = N->getValueType(0);
-    if (VT == MVT::f32 && !Subtarget->hasFP32Denormals())
+    EVT SVT = VT.getScalarType();
+    if (SVT == MVT::f32 && !Subtarget->hasFP32Denormals())
       return DAG.getConstantFP(0.0, SDLoc(N), VT);
 
-    if (VT == MVT::f64 && !Subtarget->hasFP64Denormals())
+    if (SVT == MVT::f64 && !Subtarget->hasFP64Denormals())
       return DAG.getConstantFP(0.0, SDLoc(N), VT);
 
-    if (VT == MVT::f16 && !Subtarget->hasFP16Denormals())
+    if (SVT == MVT::f16 && !Subtarget->hasFP16Denormals())
       return DAG.getConstantFP(0.0, SDLoc(N), VT);
   }
 
@@ -4001,7 +4047,7 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
     return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
   }
 
-  return SDValue(CFP, 0);
+  return N->getOperand(0);
 }
 
 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
@@ -4270,7 +4316,6 @@ SDValue SITargetLowering::performFAddCombine(SDNode *N,
   SelectionDAG &DAG = DCI.DAG;
   EVT VT = N->getValueType(0);
-  assert(!VT.isVector());
 
   SDLoc SL(N);
   SDValue LHS = N->getOperand(0);
@@ -4509,6 +4554,24 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     return performFMed3Combine(N, DCI);
   case AMDGPUISD::CVT_PKRTZ_F16_F32:
     return performCvtPkRTZCombine(N, DCI);
+  case ISD::SCALAR_TO_VECTOR: {
+    SelectionDAG &DAG = DCI.DAG;
+    EVT VT = N->getValueType(0);
+
+    // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
+    if (VT == MVT::v2i16 || VT == MVT::v2f16) {
+      SDLoc SL(N);
+      SDValue Src = N->getOperand(0);
+      EVT EltVT = Src.getValueType();
+      if (EltVT == MVT::f16)
+        Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
+
+      SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
+      return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
+    }
+
+    break;
+  }
   }
 
   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
 }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index ead43731809..1833b324915 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1839,17 +1839,26 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
   // would be for any 32-bit integer operand, but would not be for a 64-bit one.
 
   int64_t Imm = MO.getImm();
-  switch (operandBitWidth(OperandType)) {
-  case 32: {
+  switch (OperandType) {
+  case AMDGPU::OPERAND_REG_IMM_INT32:
+  case AMDGPU::OPERAND_REG_IMM_FP32:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
     int32_t Trunc = static_cast<int32_t>(Imm);
     return Trunc == Imm &&
            AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
   }
-  case 64: {
+  case AMDGPU::OPERAND_REG_IMM_INT64:
+  case AMDGPU::OPERAND_REG_IMM_FP64:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP64: {
     return AMDGPU::isInlinableLiteral64(MO.getImm(),
                                         ST.hasInv2PiInlineImm());
   }
-  case 16: {
+  case AMDGPU::OPERAND_REG_IMM_INT16:
+  case AMDGPU::OPERAND_REG_IMM_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
     if (isInt<16>(Imm) || isUInt<16>(Imm)) {
       // A few special case instructions have 16-bit operands on subtargets
       // where 16-bit instructions are not legal.
@@ -1862,6 +1871,11 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
     return false;
   }
+  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
+    uint32_t Trunc = static_cast<uint32_t>(Imm);
+    return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
+  }
   default:
     llvm_unreachable("invalid bitwidth");
   }
@@ -3117,6 +3131,14 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
     case AMDGPU::S_BFE_U64:
     case AMDGPU::S_BFM_B64:
       llvm_unreachable("Moving this op to VALU not implemented");
+
+    case AMDGPU::S_PACK_LL_B32_B16:
+    case AMDGPU::S_PACK_LH_B32_B16:
+    case AMDGPU::S_PACK_HH_B32_B16: {
+      movePackToVALU(Worklist, MRI, Inst);
+      Inst.eraseFromParent();
+      continue;
+    }
     }
 
     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
@@ -3467,6 +3489,82 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
   }
 }
 
+void SIInstrInfo::movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist,
+                                 MachineRegisterInfo &MRI,
+                                 MachineInstr &Inst) const {
+  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  MachineBasicBlock *MBB = Inst.getParent();
+  MachineOperand &Src0 = Inst.getOperand(1);
+  MachineOperand &Src1 = Inst.getOperand(2);
+  const DebugLoc &DL = Inst.getDebugLoc();
+
+  switch (Inst.getOpcode()) {
+  case AMDGPU::S_PACK_LL_B32_B16: {
+    // v_pack_b32_f16 flushes denormals if not enabled. Use it if the default
+    // is to leave them untouched.
+    // XXX: Does this do anything to NaNs?
+    if (ST.hasFP16Denormals()) {
+      BuildMI(*MBB, Inst, DL, get(AMDGPU::V_PACK_B32_F16), ResultReg)
+        .addImm(0)  // src0_modifiers
+        .add(Src0)  // src0
+        .addImm(0)  // src1_modifiers
+        .add(Src1)  // src1
+        .addImm(0)  // clamp
+        .addImm(0); // omod
+    } else {
+      unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+      // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
+      // 0.
+      BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
+        .addImm(0xffff);
+
+      BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
+        .addReg(ImmReg, RegState::Kill)
+        .add(Src0);
+
+      BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
+        .add(Src1)
+        .addImm(16)
+        .addReg(TmpReg, RegState::Kill);
+    }
+
+    break;
+  }
+  case AMDGPU::S_PACK_LH_B32_B16: {
+    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
+      .addImm(0xffff);
+    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
+      .addReg(ImmReg, RegState::Kill)
+      .add(Src0)
+      .add(Src1);
+    break;
+  }
+  case AMDGPU::S_PACK_HH_B32_B16: {
+    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
+      .addImm(16)
+      .add(Src0);
+    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
+      .addImm(0xffff);
+    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
+      .add(Src1)
+      .addReg(ImmReg, RegState::Kill)
+      .addReg(TmpReg, RegState::Kill);
+    break;
+  }
+  default:
+    llvm_unreachable("unhandled s_pack_* instruction");
+  }
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MRI.replaceRegWith(Dest.getReg(), ResultReg);
+  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+}
+
 void SIInstrInfo::addSCCDefUsersToVALUWorklist(
   MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const {
   // This assumes that all the users of SCC are in the same block
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 73b997df7bd..6723105c07e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -69,6 +69,9 @@ private:
                           MachineInstr &Inst) const;
   void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
                            MachineInstr &Inst) const;
+  void movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist,
+                      MachineRegisterInfo &MRI,
+                      MachineInstr &Inst) const;
 
   void addUsersToMoveToVALUWorklist(
     unsigned Reg, MachineRegisterInfo &MRI,
@@ -498,28 +501,6 @@ public:
     return !RI.isSGPRReg(MRI, Dest);
   }
 
-  static int operandBitWidth(uint8_t OperandType) {
-    switch (OperandType) {
-    case AMDGPU::OPERAND_REG_IMM_INT32:
-    case AMDGPU::OPERAND_REG_IMM_FP32:
-    case AMDGPU::OPERAND_REG_INLINE_C_INT32:
-    case AMDGPU::OPERAND_REG_INLINE_C_FP32:
-      return 32;
-    case AMDGPU::OPERAND_REG_IMM_INT64:
-    case AMDGPU::OPERAND_REG_IMM_FP64:
-    case AMDGPU::OPERAND_REG_INLINE_C_INT64:
-    case AMDGPU::OPERAND_REG_INLINE_C_FP64:
-      return 64;
-    case AMDGPU::OPERAND_REG_INLINE_C_INT16:
-    case AMDGPU::OPERAND_REG_INLINE_C_FP16:
-    case AMDGPU::OPERAND_REG_IMM_INT16:
-    case AMDGPU::OPERAND_REG_IMM_FP16:
-      return 16;
-    default:
-      llvm_unreachable("unexpected operand type");
-    }
-  }
-
   bool isInlineConstant(const APInt &Imm) const;
 
   bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 080532ab114..1fc3fa81f30 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -196,6 +196,21 @@ def si_uniform_br_scc : PatFrag <
   return isCBranchSCC(N);
 }]>;
 
+def lshr_rev : PatFrag <
+  (ops node:$src1, node:$src0),
+  (srl $src0, $src1)
+>;
+
+def ashr_rev : PatFrag <
+  (ops node:$src1, node:$src0),
+  (sra $src0, $src1)
+>;
+
+def lshl_rev : PatFrag <
+  (ops node:$src1, node:$src0),
+  (shl $src0, $src1)
+>;
+
 multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0> {
 
   def _glue : SDNode <
@@ -658,11 +673,16 @@ def SIOperand {
   int FLAT_SCR = 0x68;
 }
 
+// This should be kept in sync with the SISrcMods enum
 def SRCMODS {
   int NONE = 0;
   int NEG = 1;
   int ABS = 2;
   int NEG_ABS = 3;
+
+  int NEG_HI = ABS;
+  int OP_SEL_0 = 4;
+  int OP_SEL_1 = 8;
 }
 
 def DSTCLAMP {
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9f7642ceb9a..5ec3cc2102a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -622,6 +622,10 @@ def : BitConvert <v2f16, i32, SReg_32>;
 def : BitConvert <i32, v2f16, SReg_32>;
 def : BitConvert <v2i16, v2f16, SReg_32>;
 def : BitConvert <v2f16, v2i16, SReg_32>;
+def : BitConvert <v2f16, f32, SReg_32>;
+def : BitConvert <f32, v2f16, SReg_32>;
+def : BitConvert <v2i16, f32, SReg_32>;
+def : BitConvert <f32, v2i16, SReg_32>;
 
 // 64-bit bitcast
 def : BitConvert <i64, f64, VReg_64>;
@@ -775,6 +779,25 @@ def : Pat <
   (S_OR_B32 $src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
 >;
 
+def : Pat <
+  (fneg v2f16:$src),
+  (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), $src)
+>;
+
+def : Pat <
+  (fabs v2f16:$src),
+  (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src)
+>;
+
+// This is really (fneg (fabs v2f16:$src))
+//
+// fabs is not reported as free because there is no modifier for it in
+// VOP3P instructions, so it is turned into the bit op.
+def : Pat <
+  (fneg (v2f16 (bitconvert (and_oneuse i32:$src, 0x7fff7fff)))),
+  (S_OR_B32 (S_MOV_B32 (i32 0x80008000)), $src) // Set sign bit
+>;
+
 /********** ================== **********/
 /********** Immediate Patterns **********/
 /********** ================== **********/
@@ -1107,6 +1130,12 @@ def : Pat<
   (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src, 0, 0)
 >;
 
+def : Pat<
+  (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
+  (V_PK_MUL_F16 SRCMODS.OP_SEL_1, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
+>;
+
+
 // Allow integer inputs
 class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : Pat<
   (node (i8 timm:$tgt), (i8 timm:$en), vt:$src0, vt:$src1, vt:$src2, vt:$src3,
         (i1 timm:$compr), (i1 timm:$vm)),
@@ -1116,6 +1145,40 @@ class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : Pat<
 def : ExpPattern<AMDGPUexport, i32, EXP>;
 def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>;
 
+def : Pat <
+  (v2i16 (build_vector i16:$src0, i16:$src1)),
+  (v2i16 (S_PACK_LL_B32_B16 $src0, $src1))
+>;
+
+// With multiple uses of the shift, this will duplicate the shift and
+// increase register pressure.
+def : Pat <
+  (v2i16 (build_vector i16:$src0, (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))),
+  (v2i16 (S_PACK_LH_B32_B16 i16:$src0, i32:$src1))
+>;
+
+def : Pat <
+  (v2i16 (build_vector (i16 (trunc (srl_oneuse i32:$src0, (i32 16)))),
+                       (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))),
+  (v2i16 (S_PACK_HH_B32_B16 $src0, $src1))
+>;
+
+// TODO: Should source modifiers be matched to v_pack_b32_f16?
+def : Pat <
+  (v2f16 (build_vector f16:$src0, f16:$src1)),
+  (v2f16 (S_PACK_LL_B32_B16 $src0, $src1))
+>;
+
+// def : Pat <
+//   (v2f16 (scalar_to_vector f16:$src0)),
+//   (COPY $src0)
+// >;
+
+// def : Pat <
+//   (v2i16 (scalar_to_vector i16:$src0)),
+//   (COPY $src0)
+// >;
+
 //===----------------------------------------------------------------------===//
 // Fract Patterns
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index fa4d2f77731..96d34309913 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -24,26 +24,26 @@ class VOP3_VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_
 >;
 
 let isCommutable = 1 in {
-def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>;
-def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>>;
-def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>>;
-def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>>;
-def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>>;
+def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>;
+def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>;
+def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>;
+def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum>;
+def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum>;
 
-def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
+def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>;
 def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-def V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
+def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>;
+def V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, mul>;
 
-def V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-def V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-def V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-def V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
+def V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smin>;
+def V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umin>;
+def V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smax>;
+def V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umax>;
 }
 
-def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
+def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshl_rev>;
+def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
+def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
 
 // XXX - Commutable?
 def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
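
The new OPERAND_REG_INLINE_C_V2INT16/V2FP16 cases in SIInstrInfo::isInlineConstant defer to AMDGPU::isInlinableLiteralV216, whose definition lies outside this diff. A sketch of what it presumably checks, under the assumption that the hardware replicates a single 16-bit inline constant into both halves of a packed operand, so only a splat can be encoded inline (the Sketch suffix and the free-function form are mine, not the backend's):

#include <cstdint>

// Assumed to exist elsewhere in the AMDGPU backend (AMDGPUBaseInfo);
// only declared here so the sketch is self-contained to compile.
bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi);

// Hypothetical mirror of AMDGPU::isInlinableLiteralV216: a packed 32-bit
// literal is inlinable only if both 16-bit halves are identical and that
// half is itself a valid 16-bit inline constant.
bool isInlinableLiteralV216Sketch(uint32_t Literal, bool HasInv2Pi) {
  int16_t Lo16 = static_cast<int16_t>(Literal & 0xffffu);
  int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
  return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi);
}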