Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp |  18
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp   |  69
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp   |  11
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructions.td    |  10
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFoldOperands.cpp       |  14
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp       |  75
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp          | 106
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h            |  25
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td           |  20
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td        |  63
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3PInstructions.td     |  30
11 files changed, 378 insertions, 63 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 23f124b637f..0652dacd9b0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -181,12 +181,20 @@ bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
}
bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
- if (T->isIntegerTy() && T->getIntegerBitWidth() > 1 &&
- T->getIntegerBitWidth() <= 16)
+ const IntegerType *IntTy = dyn_cast<IntegerType>(T);
+ if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
return true;
- if (!T->isVectorTy())
- return false;
- return needsPromotionToI32(cast<VectorType>(T)->getElementType());
+
+ if (const VectorType *VT = dyn_cast<VectorType>(T)) {
+ // TODO: The set of packed operations is more limited, so may want to
+ // promote some anyway.
+ if (ST->hasVOP3PInsts())
+ return false;
+
+ return needsPromotionToI32(VT->getElementType());
+ }
+
+ return false;
}
// Return true if the op promoted to i32 should have nsw set.
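The rewritten check above stops promoting vector element types to i32 when the subtarget has packed (VOP3P) instructions, since <2 x i16>/<2 x half> operations can now stay at their native width. A dependency-free sketch of the same decision, with simplified scalar parameters standing in for the LLVM Type* query (names here are illustrative only):

// Promote small scalar integers (2..16 bits) to i32; promote vector element
// types the same way only when the subtarget lacks packed VOP3P instructions.
bool needsPromotionToI32Sketch(unsigned elemBits, bool isVector,
                               bool hasVOP3PInsts) {
  const bool smallInt = elemBits > 1 && elemBits <= 16;
  return isVector ? (!hasVOP3PInsts && smallInt) : smallInt;
}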
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index e02ced04f08..fddf94339a1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -159,6 +159,10 @@ private:
SDValue &Clamp,
SDValue &Omod) const;
+ bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3PMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
+ SDValue &Clamp) const;
+
void SelectADD_SUB_I64(SDNode *N);
void SelectUADDO_USUBO(SDNode *N);
void SelectDIV_SCALE(SDNode *N);
@@ -305,6 +309,20 @@ static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
llvm_unreachable("invalid vector size");
}
+static bool getConstantValue(SDValue N, uint32_t &Out) {
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
+ Out = C->getAPIntValue().getZExtValue();
+ return true;
+ }
+
+ if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
+ Out = C->getValueAPF().bitcastToAPInt().getZExtValue();
+ return true;
+ }
+
+ return false;
+}
+
void AMDGPUDAGToDAGISel::Select(SDNode *N) {
unsigned int Opc = N->getOpcode();
if (N->isMachineOpcode()) {
@@ -356,7 +374,24 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
EVT VT = N->getValueType(0);
unsigned NumVectorElts = VT.getVectorNumElements();
EVT EltVT = VT.getVectorElementType();
+
+ if (VT == MVT::v2i16 || VT == MVT::v2f16) {
+ if (Opc == ISD::BUILD_VECTOR) {
+ uint32_t LHSVal, RHSVal;
+ if (getConstantValue(N->getOperand(0), LHSVal) &&
+ getConstantValue(N->getOperand(1), RHSVal)) {
+ uint32_t K = LHSVal | (RHSVal << 16);
+ CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, VT,
+ CurDAG->getTargetConstant(K, SDLoc(N), MVT::i32));
+ return;
+ }
+ }
+
+ break;
+ }
+
assert(EltVT.bitsEq(MVT::i32));
+
if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
RegClassID = selectSGPRVectorRegClassID(NumVectorElts);
} else {
@@ -1565,7 +1600,6 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
unsigned Mods = 0;
-
Src = In;
if (Src.getOpcode() == ISD::FNEG) {
@@ -1579,7 +1613,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
}
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
-
return true;
}
@@ -1633,6 +1666,38 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
return SelectVOP3Mods(In, Src, SrcMods);
}
+bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ unsigned Mods = 0;
+ Src = In;
+
+ // FIXME: Look for fneg on separate components
+ if (Src.getOpcode() == ISD::FNEG) {
+ Mods |= (SISrcMods::NEG | SISrcMods::NEG_HI);
+ Src = Src.getOperand(0);
+ }
+
+ // Packed instructions do not have abs modifiers.
+
+ // FIXME: Handle abs/neg of individual components.
+ // FIXME: Handle swizzling with op_sel
+ Mods |= SISrcMods::OP_SEL_1;
+
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src,
+ SDValue &SrcMods,
+ SDValue &Clamp) const {
+ SDLoc SL(In);
+
+ // FIXME: Handle clamp and op_sel
+ Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);
+
+ return SelectVOP3PMods(In, Src, SrcMods);
+}
+
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
const AMDGPUTargetLowering& Lowering =
*static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
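For reference, the v2i16/v2f16 BUILD_VECTOR case added above folds two constant elements into one 32-bit S_MOV_B32 immediate with K = LHSVal | (RHSVal << 16). A standalone sketch of that packing (illustration only, not part of the patch):

#include <cassert>
#include <cstdint>

// Pack two 16-bit element constants into the 32-bit S_MOV_B32 immediate:
// element 0 goes to the low half, element 1 to the high half.
uint32_t packV2Imm(uint16_t lo, uint16_t hi) {
  return static_cast<uint32_t>(lo) | (static_cast<uint32_t>(hi) << 16);
}

int main() {
  // Two half-precision 1.0 values (0x3C00) pack to 0x3C003C00, the constant
  // this patch also adds as V2FP16_ONE in AMDGPUInstructions.td.
  assert(packV2Imm(0x3C00, 0x3C00) == 0x3C003C00u);
  return 0;
}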
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index f28afa89bd2..edaab0063da 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -644,12 +644,17 @@ bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
assert(VT.isFloatingPoint());
- return VT == MVT::f32 || VT == MVT::f64 || (Subtarget->has16BitInsts() &&
- VT == MVT::f16);
+
+ // Packed operations do not have a fabs modifier.
+ return VT == MVT::f32 || VT == MVT::f64 ||
+ (Subtarget->has16BitInsts() && VT == MVT::f16);
}
bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
- return isFAbsFree(VT);
+ assert(VT.isFloatingPoint());
+ return VT == MVT::f32 || VT == MVT::f64 ||
+ (Subtarget->has16BitInsts() && VT == MVT::f16) ||
+ (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
}
bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index d0c62877524..ba2aed68fb8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -75,6 +75,12 @@ def brtarget : Operand<OtherVT>;
// Misc. PatFrags
//===----------------------------------------------------------------------===//
+class HasOneUseUnaryOp<SDPatternOperator op> : PatFrag<
+ (ops node:$src0),
+ (op $src0),
+ [{ return N->hasOneUse(); }]
+>;
+
class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
(ops node:$src0, node:$src1),
(op $src0, $src1),
@@ -87,6 +93,7 @@ class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag<
[{ return N->hasOneUse(); }]
>;
+def trunc_oneuse : HasOneUseUnaryOp<trunc>;
let Properties = [SDNPCommutative, SDNPAssociative] in {
def smax_oneuse : HasOneUseBinOp<smax>;
@@ -101,6 +108,8 @@ def xor_oneuse : HasOneUseBinOp<xor>;
} // Properties = [SDNPCommutative, SDNPAssociative]
def sub_oneuse : HasOneUseBinOp<sub>;
+
+def srl_oneuse : HasOneUseBinOp<srl>;
def shl_oneuse : HasOneUseBinOp<shl>;
def select_oneuse : HasOneUseTernaryOp<select>;
@@ -440,6 +449,7 @@ int PI = 0x40490fdb;
int TWO_PI_INV = 0x3e22f983;
int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding
int FP16_ONE = 0x3C00;
+int V2FP16_ONE = 0x3C003C00;
int FP32_ONE = 0x3f800000;
int FP32_NEG_ONE = 0xbf800000;
int FP64_ONE = 0x3ff0000000000000;
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 73cd96b1180..0a0584d5074 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -367,8 +367,6 @@ void SIFoldOperands::foldOperand(
const TargetRegisterClass *FoldRC =
TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
- APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType),
- OpToFold.getImm());
// Split 64-bit constants into 32-bits for folding.
if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
@@ -378,21 +376,25 @@ void SIFoldOperands::foldOperand(
MRI->getRegClass(UseReg) :
TRI->getPhysRegClass(UseReg);
- assert(Imm.getBitWidth() == 64);
-
if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
return;
+ APInt Imm(64, OpToFold.getImm());
if (UseOp.getSubReg() == AMDGPU::sub0) {
Imm = Imm.getLoBits(32);
} else {
assert(UseOp.getSubReg() == AMDGPU::sub1);
Imm = Imm.getHiBits(32);
}
+
+ MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
+ tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
+ return;
}
- MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
- tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
+
+
+ tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
}
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
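The reworked folding path above now materializes the 64-bit APInt only when a sub0/sub1 use forces a split, then takes the low or high 32 bits. The same split as plain integer arithmetic (a sketch, not the pass itself):

#include <cassert>
#include <cstdint>

// Pick the 32-bit half of a 64-bit immediate for a subregister use, mirroring
// the APInt::getLoBits/getHiBits calls in the hunk above.
uint32_t immForSubReg(uint64_t imm, bool isSub1) {
  return isSub1 ? static_cast<uint32_t>(imm >> 32)  // sub1: high 32 bits
                : static_cast<uint32_t>(imm);       // sub0: low 32 bits
}

int main() {
  assert(immForSubReg(0x123456789abcdef0ull, /*isSub1=*/false) == 0x9abcdef0u);
  assert(immForSubReg(0x123456789abcdef0ull, /*isSub1=*/true) == 0x12345678u);
  return 0;
}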
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 109870781f5..0959707ac9a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -402,6 +402,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
}
+ // XXX - Do these do anything? Vector constants turn into build_vector.
+ setOperationAction(ISD::Constant, MVT::v2i16, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
+
setOperationAction(ISD::STORE, MVT::v2i16, Promote);
AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
setOperationAction(ISD::STORE, MVT::v2f16, Promote);
@@ -411,6 +415,46 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
+
+ setOperationAction(ISD::AND, MVT::v2i16, Promote);
+ AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
+ setOperationAction(ISD::OR, MVT::v2i16, Promote);
+ AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
+ setOperationAction(ISD::XOR, MVT::v2i16, Promote);
+ AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
+ setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
+ setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
+
+ setOperationAction(ISD::ADD, MVT::v2i16, Legal);
+ setOperationAction(ISD::SUB, MVT::v2i16, Legal);
+ setOperationAction(ISD::MUL, MVT::v2i16, Legal);
+ setOperationAction(ISD::SHL, MVT::v2i16, Legal);
+ setOperationAction(ISD::SRL, MVT::v2i16, Legal);
+ setOperationAction(ISD::SRA, MVT::v2i16, Legal);
+ setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
+ setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
+ setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
+ setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
+
+ setOperationAction(ISD::FADD, MVT::v2f16, Legal);
+ setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
+ setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
+ setOperationAction(ISD::FMA, MVT::v2f16, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
+
+ // This isn't really legal, but this avoids the legalizer unrolling it (and
+ // allows matching fneg (fabs x) patterns)
+ setOperationAction(ISD::FABS, MVT::v2f16, Legal);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
+ setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
}
setTargetDAGCombine(ISD::FADD);
@@ -428,6 +472,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::FCANONICALIZE);
+ setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
// All memory operations. Some folding on the pointer operand is done to help
// matching the constant offsets in the addressing modes.
@@ -3965,7 +4010,7 @@ SDValue SITargetLowering::performClassCombine(SDNode *N,
SDValue SITargetLowering::performFCanonicalizeCombine(
SDNode *N,
DAGCombinerInfo &DCI) const {
- ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
+ ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0));
if (!CFP)
return SDValue();
@@ -3975,13 +4020,14 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
// Flush denormals to 0 if not enabled.
if (C.isDenormal()) {
EVT VT = N->getValueType(0);
- if (VT == MVT::f32 && !Subtarget->hasFP32Denormals())
+ EVT SVT = VT.getScalarType();
+ if (SVT == MVT::f32 && !Subtarget->hasFP32Denormals())
return DAG.getConstantFP(0.0, SDLoc(N), VT);
- if (VT == MVT::f64 && !Subtarget->hasFP64Denormals())
+ if (SVT == MVT::f64 && !Subtarget->hasFP64Denormals())
return DAG.getConstantFP(0.0, SDLoc(N), VT);
- if (VT == MVT::f16 && !Subtarget->hasFP16Denormals())
+ if (SVT == MVT::f16 && !Subtarget->hasFP16Denormals())
return DAG.getConstantFP(0.0, SDLoc(N), VT);
}
@@ -4001,7 +4047,7 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
}
- return SDValue(CFP, 0);
+ return N->getOperand(0);
}
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
@@ -4270,7 +4316,6 @@ SDValue SITargetLowering::performFAddCombine(SDNode *N,
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
- assert(!VT.isVector());
SDLoc SL(N);
SDValue LHS = N->getOperand(0);
@@ -4509,6 +4554,24 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performFMed3Combine(N, DCI);
case AMDGPUISD::CVT_PKRTZ_F16_F32:
return performCvtPkRTZCombine(N, DCI);
+ case ISD::SCALAR_TO_VECTOR: {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+
+ // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
+ if (VT == MVT::v2i16 || VT == MVT::v2f16) {
+ SDLoc SL(N);
+ SDValue Src = N->getOperand(0);
+ EVT EltVT = Src.getValueType();
+ if (EltVT == MVT::f16)
+ Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
+
+ SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
+ return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
+ }
+
+ break;
+ }
}
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
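The SCALAR_TO_VECTOR combine above only has to place the 16-bit scalar into the low half of a 32-bit value; the any_extend leaves the high half unspecified, which then bitcasts to <x, undef>. A bit-level sketch (the any_extend is modeled as a zero extend here purely so the example is deterministic):

#include <cassert>
#include <cstdint>

// v2i16 (scalar_to_vector i16:x) ~> (bitcast (any_extend i16:x)):
// the scalar becomes element 0 (low 16 bits); element 1 is undefined.
uint32_t scalarToV2I16Bits(uint16_t x) { return x; }

int main() {
  uint32_t v = scalarToV2I16Bits(0xBEEF);
  assert((v & 0xffffu) == 0xBEEFu);  // element 0 holds the scalar
  return 0;
}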
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index ead43731809..1833b324915 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1839,17 +1839,26 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
// would be for any 32-bit integer operand, but would not be for a 64-bit one.
int64_t Imm = MO.getImm();
- switch (operandBitWidth(OperandType)) {
- case 32: {
+ switch (OperandType) {
+ case AMDGPU::OPERAND_REG_IMM_INT32:
+ case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
int32_t Trunc = static_cast<int32_t>(Imm);
return Trunc == Imm &&
AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
}
- case 64: {
+ case AMDGPU::OPERAND_REG_IMM_INT64:
+ case AMDGPU::OPERAND_REG_IMM_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64: {
return AMDGPU::isInlinableLiteral64(MO.getImm(),
ST.hasInv2PiInlineImm());
}
- case 16: {
+ case AMDGPU::OPERAND_REG_IMM_INT16:
+ case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
if (isInt<16>(Imm) || isUInt<16>(Imm)) {
// A few special case instructions have 16-bit operands on subtargets
// where 16-bit instructions are not legal.
@@ -1862,6 +1871,11 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
return false;
}
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
+ uint32_t Trunc = static_cast<uint32_t>(Imm);
+ return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
+ }
default:
llvm_unreachable("invalid bitwidth");
}
@@ -3117,6 +3131,14 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
case AMDGPU::S_BFE_U64:
case AMDGPU::S_BFM_B64:
llvm_unreachable("Moving this op to VALU not implemented");
+
+ case AMDGPU::S_PACK_LL_B32_B16:
+ case AMDGPU::S_PACK_LH_B32_B16:
+ case AMDGPU::S_PACK_HH_B32_B16: {
+ movePackToVALU(Worklist, MRI, Inst);
+ Inst.eraseFromParent();
+ continue;
+ }
}
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
@@ -3467,6 +3489,82 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
}
}
+void SIInstrInfo::movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineRegisterInfo &MRI,
+ MachineInstr &Inst) const {
+ unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MachineBasicBlock *MBB = Inst.getParent();
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+ const DebugLoc &DL = Inst.getDebugLoc();
+
+ switch (Inst.getOpcode()) {
+ case AMDGPU::S_PACK_LL_B32_B16: {
+ // v_pack_b32_f16 flushes denormals if not enabled. Use it if the default
+ // is to leave them untouched.
+ // XXX: Does this do anything to NaNs?
+ if (ST.hasFP16Denormals()) {
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_PACK_B32_F16), ResultReg)
+ .addImm(0) // src0_modifiers
+ .add(Src0) // src0
+ .addImm(0) // src1_modifiers
+ .add(Src1) // src1
+ .addImm(0) // clamp
+ .addImm(0); // omod
+ } else {
+ unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
+ // 0.
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
+ .addImm(0xffff);
+
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
+ .addReg(ImmReg, RegState::Kill)
+ .add(Src0);
+
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
+ .add(Src1)
+ .addImm(16)
+ .addReg(TmpReg, RegState::Kill);
+ }
+
+ break;
+ }
+ case AMDGPU::S_PACK_LH_B32_B16: {
+ unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
+ .addImm(0xffff);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
+ .addReg(ImmReg, RegState::Kill)
+ .add(Src0)
+ .add(Src1);
+ break;
+ }
+ case AMDGPU::S_PACK_HH_B32_B16: {
+ unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
+ .addImm(16)
+ .add(Src0);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
+ .addImm(0xffff);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
+ .add(Src1)
+ .addReg(ImmReg, RegState::Kill)
+ .addReg(TmpReg, RegState::Kill);
+ break;
+ }
+ default:
+ llvm_unreachable("unhandled s_pack_* instruction");
+ }
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MRI.replaceRegWith(Dest.getReg(), ResultReg);
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+}
+
void SIInstrInfo::addSCCDefUsersToVALUWorklist(
MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const {
// This assumes that all the users of SCC are in the same block
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 73b997df7bd..6723105c07e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -69,6 +69,9 @@ private:
MachineInstr &Inst) const;
void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
MachineInstr &Inst) const;
+ void movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist,
+ MachineRegisterInfo &MRI,
+ MachineInstr &Inst) const;
void addUsersToMoveToVALUWorklist(
unsigned Reg, MachineRegisterInfo &MRI,
@@ -498,28 +501,6 @@ public:
return !RI.isSGPRReg(MRI, Dest);
}
- static int operandBitWidth(uint8_t OperandType) {
- switch (OperandType) {
- case AMDGPU::OPERAND_REG_IMM_INT32:
- case AMDGPU::OPERAND_REG_IMM_FP32:
- case AMDGPU::OPERAND_REG_INLINE_C_INT32:
- case AMDGPU::OPERAND_REG_INLINE_C_FP32:
- return 32;
- case AMDGPU::OPERAND_REG_IMM_INT64:
- case AMDGPU::OPERAND_REG_IMM_FP64:
- case AMDGPU::OPERAND_REG_INLINE_C_INT64:
- case AMDGPU::OPERAND_REG_INLINE_C_FP64:
- return 64;
- case AMDGPU::OPERAND_REG_INLINE_C_INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_FP16:
- case AMDGPU::OPERAND_REG_IMM_INT16:
- case AMDGPU::OPERAND_REG_IMM_FP16:
- return 16;
- default:
- llvm_unreachable("unexpected operand type");
- }
- }
-
bool isInlineConstant(const APInt &Imm) const;
bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 080532ab114..1fc3fa81f30 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -196,6 +196,21 @@ def si_uniform_br_scc : PatFrag <
return isCBranchSCC(N);
}]>;
+def lshr_rev : PatFrag <
+ (ops node:$src1, node:$src0),
+ (srl $src0, $src1)
+>;
+
+def ashr_rev : PatFrag <
+ (ops node:$src1, node:$src0),
+ (sra $src0, $src1)
+>;
+
+def lshl_rev : PatFrag <
+ (ops node:$src1, node:$src0),
+ (shl $src0, $src1)
+>;
+
multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0> {
def _glue : SDNode <
@@ -658,11 +673,16 @@ def SIOperand {
int FLAT_SCR = 0x68;
}
+// This should be kept in sync with SISrcMods enum
def SRCMODS {
int NONE = 0;
int NEG = 1;
int ABS = 2;
int NEG_ABS = 3;
+
+ int NEG_HI = ABS;
+ int OP_SEL_0 = 4;
+ int OP_SEL_1 = 8;
}
def DSTCLAMP {
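The *_rev PatFrags above exist because the hardware rev-form shifts take the shift amount as the first source operand, so (srl $val, $amt) has to be matched with its operands swapped. A one-lane sketch of that operand order (per-lane wrap behavior of the packed shifts is not modeled):

#include <cstdint>

// v_pk_lshrrev_b16-style operand order: shift amount first, value second;
// the per-lane computation is still value >> amount.
uint16_t lshrrev16(uint16_t amount, uint16_t value) {
  return static_cast<uint16_t>(value >> amount);
}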
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9f7642ceb9a..5ec3cc2102a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -622,6 +622,10 @@ def : BitConvert <v2f16, i32, SReg_32>;
def : BitConvert <i32, v2f16, SReg_32>;
def : BitConvert <v2i16, v2f16, SReg_32>;
def : BitConvert <v2f16, v2i16, SReg_32>;
+def : BitConvert <v2f16, f32, SReg_32>;
+def : BitConvert <f32, v2f16, SReg_32>;
+def : BitConvert <v2i16, f32, SReg_32>;
+def : BitConvert <f32, v2i16, SReg_32>;
// 64-bit bitcast
def : BitConvert <i64, f64, VReg_64>;
@@ -775,6 +779,25 @@ def : Pat <
(S_OR_B32 $src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
>;
+def : Pat <
+ (fneg v2f16:$src),
+ (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), $src)
+>;
+
+def : Pat <
+ (fabs v2f16:$src),
+ (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src)
+>;
+
+// This is really (fneg (fabs v2f16:$src))
+//
+// fabs is not reported as free because there is no modifier for it in
+// VOP3P instructions, so it is turned into the bit op.
+def : Pat <
+ (fneg (v2f16 (bitconvert (and_oneuse i32:$src, 0x7fff7fff)))),
+ (S_OR_B32 (S_MOV_B32 (i32 0x80008000)), $src) // Set sign bit
+>;
+
/********** ================== **********/
/********** Immediate Patterns **********/
/********** ================== **********/
@@ -1107,6 +1130,12 @@ def : Pat<
(V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src, 0, 0)
>;
+def : Pat<
+ (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
+ (V_PK_MUL_F16 SRCMODS.OP_SEL_1, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
+>;
+
+
// Allow integer inputs
class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : Pat<
(node (i8 timm:$tgt), (i8 timm:$en), vt:$src0, vt:$src1, vt:$src2, vt:$src3, (i1 timm:$compr), (i1 timm:$vm)),
@@ -1116,6 +1145,40 @@ class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : Pat<
def : ExpPattern<AMDGPUexport, i32, EXP>;
def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>;
+def : Pat <
+ (v2i16 (build_vector i16:$src0, i16:$src1)),
+ (v2i16 (S_PACK_LL_B32_B16 $src0, $src1))
+>;
+
+// With multiple uses of the shift, this will duplicate the shift and
+// increase register pressure.
+def : Pat <
+ (v2i16 (build_vector i16:$src0, (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))),
+ (v2i16 (S_PACK_LH_B32_B16 i16:$src0, i32:$src1))
+>;
+
+def : Pat <
+ (v2i16 (build_vector (i16 (trunc (srl_oneuse i32:$src0, (i32 16)))),
+ (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))),
+ (v2i16 (S_PACK_HH_B32_B16 $src0, $src1))
+>;
+
+// TODO: Should source modifiers be matched to v_pack_b32_f16?
+def : Pat <
+ (v2f16 (build_vector f16:$src0, f16:$src1)),
+ (v2f16 (S_PACK_LL_B32_B16 $src0, $src1))
+>;
+
+// def : Pat <
+// (v2f16 (scalar_to_vector f16:$src0)),
+// (COPY $src0)
+// >;
+
+// def : Pat <
+// (v2i16 (scalar_to_vector i16:$src0)),
+// (COPY $src0)
+// >;
+
//===----------------------------------------------------------------------===//
// Fract Patterns
//===----------------------------------------------------------------------===//
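The v2f16 fneg/fabs patterns added above work because each half-precision lane keeps its sign at bit 15 of that lane, so a single 32-bit bit operation covers both lanes. A standalone sketch of the masks the patterns use:

#include <cstdint>

// Sign-bit manipulation on a pair of packed half-precision values.
uint32_t fneg_v2f16(uint32_t packed)      { return packed ^ 0x80008000u; }  // flip both signs
uint32_t fabs_v2f16(uint32_t packed)      { return packed & 0x7fff7fffu; }  // clear both signs
uint32_t fneg_fabs_v2f16(uint32_t packed) { return packed | 0x80008000u; }  // set both signs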
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index fa4d2f77731..96d34309913 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -24,26 +24,26 @@ class VOP3_VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_
>;
let isCommutable = 1 in {
-def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>;
-def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>>;
-def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>>;
-def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>>;
-def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>>;
+def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>;
+def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>;
+def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>;
+def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum>;
+def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum>;
-def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
+def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>;
def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-def V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
+def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>;
+def V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, mul>;
-def V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-def V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-def V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-def V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
+def V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smin>;
+def V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umin>;
+def V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smax>;
+def V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umax>;
}
-def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
-def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
+def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshl_rev>;
+def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
+def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
// XXX - Commutable?
def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;