summaryrefslogtreecommitdiffstats
path: root/llvm/lib
diff options
context:
space:
mode:
authorMatt Arsenault <Matthew.Arsenault@amd.com>2018-05-16 11:47:30 +0000
committerMatt Arsenault <Matthew.Arsenault@amd.com>2018-05-16 11:47:30 +0000
commit67a9815a5c9d0355020e17c0a494f7eaa1df21fe (patch)
tree3422ea805c0a69f22bcdec55e8c684b282498b59 /llvm/lib
parentca22d427b94de946d4ef32b8acbdfb7e62e7cfa4 (diff)
downloadbcm5719-llvm-67a9815a5c9d0355020e17c0a494f7eaa1df21fe.tar.gz
bcm5719-llvm-67a9815a5c9d0355020e17c0a494f7eaa1df21fe.zip
AMDGPU: Custom lower v4i16/v4f16 vector operations
Avoids stack access. Also handle the extract-high-element pattern (truncate of a right shift of a bitcast build_vector) to avoid a couple of test regressions. llvm-svn: 332453
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp22
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h4
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp116
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.h1
4 files changed, 124 insertions, 19 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index d00727bf314..988554621c5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3144,6 +3144,28 @@ SDValue AMDGPUTargetLowering::performTruncateCombine(
}
}
+ // Equivalent of above for accessing the high element of a vector as an
+ // integer operation. Scalar results only: a single operand is truncated.
+ // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
+ if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
+ if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
+ if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) { // shift amount is exactly half the source width
+ SDValue BV = stripBitcast(Src.getOperand(0));
+ if (BV.getOpcode() == ISD::BUILD_VECTOR &&
+ BV.getValueType().getVectorNumElements() == 2) {
+ SDValue SrcElt = BV.getOperand(1); // operand 1 is the high element selected by the shift
+ EVT SrcEltVT = SrcElt.getValueType();
+ if (SrcEltVT.isFloatingPoint()) { // TRUNCATE needs an integer input, so reinterpret FP bits first
+ SrcElt = DAG.getNode(ISD::BITCAST, SL,
+ SrcEltVT.changeTypeToInteger(), SrcElt);
+ }
+
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
+ }
+ }
+ }
+ }
+
// Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
//
// i16 (trunc (srl i64:x, K)), K <= 16 ->
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 6db83395dc5..e8052de9e8d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -137,6 +137,10 @@ public:
return false;
}
+ static inline SDValue stripBitcast(SDValue Val) { // Returns the operand of Val if Val is an ISD::BITCAST, otherwise Val itself.
+ return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val; // looks through one bitcast level only
+ }
+
static bool allUsesHaveSourceMods(const SDNode *N,
unsigned CostThreshold = 4);
bool isFAbsFree(EVT VT) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d34249eefad..f21dd6285eb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -288,13 +288,22 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom); // handled by lowerBUILD_VECTOR
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
+
// Avoid stack access for these.
// TODO: Generalize to more vector types.
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom); // 4 x 16-bit variants, registered once
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
+
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
+
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
// and output demarshalling
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
@@ -3333,6 +3344,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT:
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::BUILD_VECTOR: // new dispatch entry; BUILD_VECTOR is marked Custom for v4i16/v4f16 in the constructor
+ return lowerBUILD_VECTOR(Op, DAG);
case ISD::FP_ROUND:
return lowerFP_ROUND(Op, DAG);
case ISD::TRAP:
@@ -4157,34 +4170,72 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
+ SDValue Vec = Op.getOperand(0); // vector being modified
+ SDValue InsVal = Op.getOperand(1); // value to insert
SDValue Idx = Op.getOperand(2);
+ EVT VecVT = Vec.getValueType();
+
+ assert(VecVT.getScalarSizeInBits() == 16); // this lowering only handles 16-bit elements
+
+ unsigned NumElts = VecVT.getVectorNumElements();
+ SDLoc SL(Op);
+ auto KIdx = dyn_cast<ConstantSDNode>(Idx); // non-null only when the index is a constant
+
+ if (NumElts == 4 && KIdx) { // constant index into a 4-element vector: rewrite as an insert into one 32-bit half
+ SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
+
+ SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
+ DAG.getConstant(0, SL, MVT::i32));
+ SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
+ DAG.getConstant(1, SL, MVT::i32));
+
+ SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf); // each i32 half viewed as two 16-bit elements
+ SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
+
+ unsigned Idx = KIdx->getZExtValue(); // NOTE(review): shadows the SDValue Idx operand inside this block
+ bool InsertLo = Idx < 2; // indices 0-1 go to the low half, 2-3 to the high half
+ SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
+ InsertLo ? LoVec : HiVec,
+ DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
+ DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32)); // index rebased to the chosen half
+
+ InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
+
+ SDValue Concat = InsertLo ? // rebuild the v2i32 from the updated half and the untouched half
+ DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
+ DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
+
+ return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
+ }
+
+ assert(NumElts == 2 || NumElts == 4);
+
if (isa<ConstantSDNode>(Idx))
return SDValue();
+ EVT IntVT = NumElts == 2 ? MVT::i32 : MVT::i64; // integer type as wide as the whole vector
+
// Avoid stack access for dynamic indexing.
- SDLoc SL(Op);
- SDValue Vec = Op.getOperand(0);
- SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1));
+ SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
// v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
- SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Val);
+ SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);
// Convert vector index to bit-index.
SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx,
DAG.getConstant(4, SL, MVT::i32));
- SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
-
- SDValue BFM = DAG.getNode(ISD::SHL, SL, MVT::i32,
- DAG.getConstant(0xffff, SL, MVT::i32),
+ SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
+ SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT, // 16-bit mask shifted by the scaled index
+ DAG.getConstant(0xffff, SL, IntVT),
ScaledIdx);
- SDValue LHS = DAG.getNode(ISD::AND, SL, MVT::i32, BFM, ExtVal);
- SDValue RHS = DAG.getNode(ISD::AND, SL, MVT::i32,
- DAG.getNOT(SL, BFM, MVT::i32), BCVec);
+ SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
+ SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT, // original vector bits outside the mask
+ DAG.getNOT(SL, BFM, IntVT), BCVec);
- SDValue BFI = DAG.getNode(ISD::OR, SL, MVT::i32, LHS, RHS);
- return DAG.getNode(ISD::BITCAST, SL, Op.getValueType(), BFI);
+ SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
+ return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
}
SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
@@ -4194,6 +4245,9 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
EVT ResultVT = Op.getValueType();
SDValue Vec = Op.getOperand(0);
SDValue Idx = Op.getOperand(1);
+ EVT VecVT = Vec.getValueType();
+ unsigned NumElts = VecVT.getVectorNumElements();
+ assert(VecVT.getScalarSizeInBits() == 16 && (NumElts == 2 || NumElts == 4)); // only 2- or 4-element vectors of 16-bit elements are expected here
DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
@@ -4204,19 +4258,43 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
return Combined;
+ EVT IntVT = NumElts == 2 ? MVT::i32 : MVT::i64; // integer type as wide as the whole vector
SDValue Four = DAG.getConstant(4, SL, MVT::i32);
// Convert vector index to bit-index (* 16)
SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Four);
- SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
- SDValue Elt = DAG.getNode(ISD::SRL, SL, MVT::i32, BC, ScaledIdx);
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
+ SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx); // shift the requested element down to bit 0
+
+ if (ResultVT == MVT::f16) { // f16 result: truncate to i16, then reinterpret the bits as f16
+ SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
+ return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
+ }
+
+ return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT); // otherwise any-extend or truncate to ResultVT
+}
+
+SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, // lowers a v4i16/v4f16 BUILD_VECTOR as two 2-element build_vectors
+ SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ EVT VT = Op.getValueType();
+ assert(VT == MVT::v4i16 || VT == MVT::v4f16);
+
+ EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2); // 2-element vector with the same element type
+
+ // Turn into pair of packed build_vectors.
+ // TODO: Special case for constants that can be materialized with s_mov_b64.
+ SDValue Lo = DAG.getBuildVector(HalfVT, SL,
+ { Op.getOperand(0), Op.getOperand(1) });
+ SDValue Hi = DAG.getBuildVector(HalfVT, SL,
+ { Op.getOperand(2), Op.getOperand(3) });
- SDValue Result = Elt;
- if (ResultVT.bitsLT(MVT::i32))
- Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
+ SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo); // view each packed pair as one i32
+ SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
- return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
+ SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi }); // low pair in element 0, high pair in element 1
+ return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
}
bool
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index fba383dbe4c..3a99994c386 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -84,6 +84,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; // custom lowering for v4i16/v4f16 BUILD_VECTOR; defined in SIISelLowering.cpp
SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;
SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
OpenPOWER on IntegriCloud