author     Matt Arsenault <Matthew.Arsenault@amd.com>   2018-06-15 15:15:46 +0000
committer  Matt Arsenault <Matthew.Arsenault@amd.com>   2018-06-15 15:15:46 +0000
commit     02dc7e19e2910b2b3a963105684af0f830885360 (patch)
tree       4fb36d460658188a9eaac6e5b0d1abe8502a39db /llvm/lib/Target/AMDGPU
parent     fa5597b24da47d5ecec4560f3f76f4bb08b405bc (diff)
AMDGPU: Make v4i16/v4f16 legal
Some image loads return these, and it's awkward working around them not being legal.

llvm-svn: 334835
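Some background for readers not steeped in the D16 return formats that drive this patch (the layout described here is this note's reading of the "_D16_gfx80" vs "_D16" patterns below, not something the commit message states): on unpacked D16 VMem (gfx80) a 4-component 16-bit load spreads across four 32-bit registers, while on packed D16 VMem two components share a register, so the whole result fits one 64-bit register pair. A small host-side sketch, assuming little-endian lane order:

// Host-side model of how a 4-component 16-bit (D16) load result is laid out.
// Assumption of this note (not stated in the patch): lane i of the packed
// form occupies bits [16*i, 16*i + 15] of a little-endian register pair.
#include <cassert>
#include <cstdint>

int main() {
  const uint16_t comp[4] = {0x3C00, 0x4000, 0x4200, 0x4400}; // f16 bit patterns for 1,2,3,4

  // Unpacked D16 VMem (gfx80): each component lands in the low half of its
  // own 32-bit register, so a 4-component result needs four registers
  // (the v4i32 "_D16_gfx80" patterns in this diff).
  uint32_t unpacked[4];
  for (int i = 0; i < 4; ++i)
    unpacked[i] = comp[i];

  // Packed D16 VMem: two components per 32-bit register, so the result is a
  // single 64-bit register pair -- exactly a v4f16 value, which is why a
  // legal v4f16 type is convenient here.
  uint32_t packed[2];
  for (int i = 0; i < 2; ++i)
    packed[i] = (uint32_t)comp[2 * i] | ((uint32_t)comp[2 * i + 1] << 16);

  assert(packed[0] == 0x40003C00u && packed[1] == 0x44004200u);
  assert(unpacked[0] == 0x3C00u && unpacked[3] == 0x4400u);
  return 0;
}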
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td   |   4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  18
-rw-r--r--  llvm/lib/Target/AMDGPU/BUFInstructions.td     |  12
-rw-r--r--  llvm/lib/Target/AMDGPU/MIMGInstructions.td    |  33
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 206
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.h       |   3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td      |  41
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.td      |  10
8 files changed, 235 insertions, 92 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index c1c066fd140..79a64c6abb3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -127,7 +127,7 @@ def CC_AMDGPU_Func : CallingConv<[
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
- CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>,
+ CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>,
CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
@@ -144,7 +144,7 @@ def RetCC_AMDGPU_Func : CallingConv<[
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
- CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>
+ CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>
]>;
def CC_AMDGPU : CallingConv<[
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 19106a5ae8d..8685de871de 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -73,7 +73,9 @@ static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
case MVT::i64:
case MVT::f64:
case MVT::v2i32:
- case MVT::v2f32: {
+ case MVT::v2f32:
+ case MVT::v4i16:
+ case MVT::v4f16: {
// Up to SGPR0-SGPR39
return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
&AMDGPU::SGPR_64RegClass, 20);
@@ -94,7 +96,9 @@ static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
case MVT::i64:
case MVT::f64:
case MVT::v2i32:
- case MVT::v2f32: {
+ case MVT::v2f32:
+ case MVT::v4i16:
+ case MVT::v4f16: {
return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
&AMDGPU::VReg_64RegClass, 31);
}
@@ -1234,6 +1238,16 @@ SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
SelectionDAG &DAG) const {
SmallVector<SDValue, 8> Args;
+ EVT VT = Op.getValueType();
+ if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ SDLoc SL(Op);
+ SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
+ SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
+
+ SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
+ return DAG.getNode(ISD::BITCAST, SL, VT, BV);
+ }
+
for (const SDUse &U : Op->ops())
DAG.ExtractVectorElements(U.get(), Args);
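The CONCAT_VECTORS path added above never touches individual elements: each v2 operand is bitcast to i32, the two words form a v2i32, and the result is bitcast back to the v4 type. The standalone check below (plain host C++, not LLVM code; little-endian lane order assumed as in the earlier note) illustrates why that is a pure relabelling of the same 64 bits:

// Check that concatenating two packed 2 x 16-bit words as {lo, hi} 32-bit
// values produces the same 64-bit pattern as packing the four lanes directly.
#include <cassert>
#include <cstdint>

static uint32_t pack2(uint16_t a, uint16_t b) {
  return (uint32_t)a | ((uint32_t)b << 16);
}

int main() {
  const uint16_t e[4] = {0x1111, 0x2222, 0x3333, 0x4444};

  uint32_t lo = pack2(e[0], e[1]);                        // bitcast v2x16 -> i32
  uint32_t hi = pack2(e[2], e[3]);
  uint64_t concat = (uint64_t)lo | ((uint64_t)hi << 32);  // build v2i32, bitcast to v4

  uint64_t direct = 0;
  for (int i = 0; i < 4; ++i)
    direct |= (uint64_t)e[i] << (16 * i);                 // lanes of the v4 vector

  assert(concat == direct);
  return 0;
}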
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 7c0b5e9d8f6..18e28b90e06 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1084,8 +1084,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in {
let SubtargetPredicate = HasPackedD16VMem in {
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">;
- defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_XY">;
- defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XYZW">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4f16, "BUFFER_LOAD_FORMAT_D16_XYZW">;
} // End HasPackedD16VMem.
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
@@ -1145,8 +1144,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in {
let SubtargetPredicate = HasPackedD16VMem in {
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2f16, "BUFFER_STORE_FORMAT_D16_XY">;
- defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i32, "BUFFER_STORE_FORMAT_D16_XY">;
- defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i32, "BUFFER_STORE_FORMAT_D16_XYZW">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4f16, "BUFFER_STORE_FORMAT_D16_XYZW">;
} // End HasPackedD16VMem.
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">;
@@ -1571,8 +1569,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in {
let SubtargetPredicate = HasPackedD16VMem in {
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">;
- defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_XY">;
- defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XYZW">;
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4f16, "TBUFFER_LOAD_FORMAT_D16_XYZW">;
} // End HasPackedD16VMem.
multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
@@ -1633,8 +1630,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in {
let SubtargetPredicate = HasPackedD16VMem in {
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2f16, "TBUFFER_STORE_FORMAT_D16_XY">;
- defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_XY">;
- defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2i32, "TBUFFER_STORE_FORMAT_D16_XYZW">;
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4f16, "TBUFFER_STORE_FORMAT_D16_XYZW">;
} // End HasPackedD16VMem.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index b63c120b348..702660c00b5 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -594,12 +594,6 @@ foreach intr = !listconcat(AMDGPUImageDimIntrinsics,
def intr#_pat_v4 : ImageDimPattern<intr, "_V4", v4f32>;
}
-// v2f16 and v4f16 are used as data types to signal that D16 should be used.
-// However, they are not (always) legal types, and the SelectionDAG requires us
-// to legalize them before running any patterns. So we legalize them by
-// converting to an int type of equal size and using an internal 'd16helper'
-// intrinsic instead which signifies both the use of D16 and actually allows
-// this integer-based return type.
multiclass ImageDimD16Helper<AMDGPUImageDimIntrinsic I,
AMDGPUImageDimIntrinsic d16helper> {
let SubtargetPredicate = HasUnpackedD16VMem in {
@@ -611,7 +605,7 @@ multiclass ImageDimD16Helper<AMDGPUImageDimIntrinsic I,
let SubtargetPredicate = HasPackedD16VMem in {
def _packed_v1 : ImageDimPattern<I, "_V1", f16, "_D16">;
def _packed_v2 : ImageDimPattern<I, "_V1", v2f16, "_D16">;
- def _packed_v4 : ImageDimPattern<d16helper, "_V2", v2i32, "_D16">;
+ def _packed_v4 : ImageDimPattern<I, "_V2", v4f16, "_D16">;
} // End HasPackedD16VMem.
}
@@ -653,10 +647,7 @@ foreach intr = AMDGPUImageDimGatherIntrinsics in {
} // End HasUnpackedD16VMem.
let SubtargetPredicate = HasPackedD16VMem in {
- def intr#_packed_v4 :
- ImageDimPattern<!cast<AMDGPUImageDimIntrinsic>(
- "int_SI_image_d16helper_" # intr.P.OpMod # intr.P.Dim.Name),
- "_V2", v2i32, "_D16">;
+ def intr#_packed_v4 : ImageDimPattern<intr, "_V2", v4f16, "_D16">;
} // End HasPackedD16VMem.
}
@@ -703,6 +694,7 @@ multiclass ImageSamplePatterns<SDPatternOperator name, string opcode> {
let SubtargetPredicate = HasPackedD16VMem in {
defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V1), f16, "_D16">;
defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V1), v2f16, "_D16">;
+ defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V2), v4f16, "_D16">;
} // End HasPackedD16VMem.
}
@@ -712,16 +704,15 @@ multiclass ImageSampleAltPatterns<SDPatternOperator name, string opcode> {
defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V2), v2i32, "_D16_gfx80">;
defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V4), v4i32, "_D16_gfx80">;
} // End HasUnpackedD16VMem.
-
- let SubtargetPredicate = HasPackedD16VMem in {
- defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V1), f16, "_D16">;
- defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V2), v2i32, "_D16">;
- } // End HasPackedD16VMem.
}
// ImageGather4 patterns.
multiclass ImageGather4Patterns<SDPatternOperator name, string opcode> {
defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V4), v4f32>;
+
+ let SubtargetPredicate = HasPackedD16VMem in {
+ defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V2), v4f16, "_D16">;
+ } // End HasPackedD16VMem.
}
// ImageGather4 alternative patterns for illegal vector half Types.
@@ -730,9 +721,6 @@ multiclass ImageGather4AltPatterns<SDPatternOperator name, string opcode> {
defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V4), v4i32, "_D16_gfx80">;
} // End HasUnpackedD16VMem.
- let SubtargetPredicate = HasPackedD16VMem in {
- defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V2), v2i32, "_D16">;
- } // End HasPackedD16VMem.
}
// ImageLoad for amdgcn.
@@ -766,6 +754,7 @@ multiclass ImageLoadPatterns<SDPatternOperator name, string opcode> {
let SubtargetPredicate = HasPackedD16VMem in {
defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V1), f16, "_D16">;
defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V1), v2f16, "_D16">;
+ defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V2), v4f16, "_D16">;
} // End HasPackedD16VMem.
}
@@ -775,11 +764,6 @@ multiclass ImageLoadAltPatterns<SDPatternOperator name, string opcode> {
defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V2), v2i32, "_D16_gfx80">;
defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V4), v4i32, "_D16_gfx80">;
} // End HasUnPackedD16VMem.
-
- let SubtargetPredicate = HasPackedD16VMem in {
- defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V1), f16, "_D16">;
- defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V2), v2i32, "_D16">;
- } // End HasPackedD16VMem.
}
// ImageStore for amdgcn.
@@ -813,6 +797,7 @@ multiclass ImageStorePatterns<SDPatternOperator name, string opcode> {
let SubtargetPredicate = HasPackedD16VMem in {
defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V1), f16, "_D16">;
defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V1), v2f16, "_D16">;
+ defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V2), v4f16, "_D16">;
} // End HasPackedD16VMem.
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index df396e6ad1f..1b91d743641 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -143,6 +143,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// Unless there are also VOP3P operations, not operations are really legal.
addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
+ addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
+ addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
}
computeRegisterProperties(STI.getRegisterInfo());
@@ -237,7 +239,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
- MVT::v2i64, MVT::v2f64}) {
+ MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16 }) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -260,6 +262,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
}
+ setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
+
// TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
// is expanded to avoid having two separate loops in case the index is a VGPR.
@@ -426,7 +430,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (!Subtarget->hasFP16Denormals())
setOperationAction(ISD::FMAD, MVT::f16, Legal);
- for (MVT VT : {MVT::v2i16, MVT::v2f16}) {
+ for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -488,6 +492,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);
+
if (!Subtarget->hasVOP3PInsts()) {
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
@@ -520,8 +528,31 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+
+ setOperationAction(ISD::SHL, MVT::v4i16, Custom);
+ setOperationAction(ISD::SRA, MVT::v4i16, Custom);
+ setOperationAction(ISD::SRL, MVT::v4i16, Custom);
+ setOperationAction(ISD::ADD, MVT::v4i16, Custom);
+ setOperationAction(ISD::SUB, MVT::v4i16, Custom);
+ setOperationAction(ISD::MUL, MVT::v4i16, Custom);
+
+ setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
+ setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
+ setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
+ setOperationAction(ISD::UMAX, MVT::v4i16, Custom);
+
+ setOperationAction(ISD::FADD, MVT::v4f16, Custom);
+ setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
+ setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
+ setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
+
+ setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
}
+ setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
+ setOperationAction(ISD::FABS, MVT::v4f16, Custom);
+
if (Subtarget->has16BitInsts()) {
setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
@@ -3383,6 +3414,49 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//
+// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
+// wider vector type is legal.
+SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
+ assert(VT == MVT::v4f16);
+
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
+
+ SDLoc SL(Op);
+ SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
+ Op->getFlags());
+ SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
+ Op->getFlags());
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
+}
+
+// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
+// wider vector type is legal.
+SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
+ assert(VT == MVT::v4i16 || VT == MVT::v4f16);
+
+ SDValue Lo0, Hi0;
+ std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
+ SDValue Lo1, Hi1;
+ std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
+
+ SDLoc SL(Op);
+
+ SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
+ Op->getFlags());
+ SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
+ Op->getFlags());
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
+}
+
SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
@@ -3423,6 +3497,24 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerTRAP(Op, DAG);
case ISD::DEBUGTRAP:
return lowerDEBUGTRAP(Op, DAG);
+ case ISD::FABS:
+ case ISD::FNEG:
+ return splitUnaryVectorOp(Op, DAG);
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FADD:
+ case ISD::FMUL:
+ return splitBinaryVectorOp(Op, DAG);
}
return SDValue();
}
@@ -3630,21 +3722,23 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
bool Unpacked = Subtarget->hasUnpackedD16VMem();
EVT LoadVT = M->getValueType(0);
- EVT UnpackedLoadVT = LoadVT.isVector() ?
- EVT::getVectorVT(*DAG.getContext(), MVT::i32,
- LoadVT.getVectorNumElements()) : LoadVT;
EVT EquivLoadVT = LoadVT;
- if (LoadVT.isVector()) {
- EquivLoadVT = Unpacked ? UnpackedLoadVT :
- getEquivalentMemType(*DAG.getContext(), LoadVT);
+ if (Unpacked && LoadVT.isVector()) {
+ EquivLoadVT = LoadVT.isVector() ?
+ EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ LoadVT.getVectorNumElements()) : LoadVT;
}
// Change from v4f16/v2f16 to EquivLoadVT.
SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
- SDValue Load = DAG.getMemIntrinsicNode(
- IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode,
- DL, VTList, Ops, M->getMemoryVT(), M->getMemOperand());
+ SDValue Load
+ = DAG.getMemIntrinsicNode(
+ IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
+ VTList, Ops, M->getMemoryVT(),
+ M->getMemOperand());
+ if (!Unpacked) // Just adjusted the opcode.
+ return Load;
SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
@@ -3734,8 +3828,10 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::FNEG: {
+ if (N->getValueType(0) != MVT::v2f16)
+ break;
+
SDLoc SL(N);
- assert(N->getValueType(0) == MVT::v2f16);
SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
@@ -3745,8 +3841,10 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::FABS: {
+ if (N->getValueType(0) != MVT::v2f16)
+ break;
+
SDLoc SL(N);
- assert(N->getValueType(0) == MVT::v2f16);
SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
@@ -4247,6 +4345,23 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
SDLoc SL(Op);
EVT VT = Op.getValueType();
+ if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
+
+ // Turn into pair of packed build_vectors.
+ // TODO: Special case for constants that can be materialized with s_mov_b64.
+ SDValue Lo = DAG.getBuildVector(HalfVT, SL,
+ { Op.getOperand(0), Op.getOperand(1) });
+ SDValue Hi = DAG.getBuildVector(HalfVT, SL,
+ { Op.getOperand(2), Op.getOperand(3) });
+
+ SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
+ SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
+
+ SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
+ return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
+ }
+
assert(VT == MVT::v2f16 || VT == MVT::v2i16);
SDValue Lo = Op.getOperand(0);
@@ -4913,11 +5028,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_image_load:
case Intrinsic::amdgcn_image_load_mip: {
- EVT LoadVT = Op.getValueType();
- if ((Subtarget->hasUnpackedD16VMem() && LoadVT == MVT::v2f16) ||
- LoadVT == MVT::v4f16) {
- MemSDNode *M = cast<MemSDNode>(Op);
- return adjustLoadValueType(getImageOpcode(IntrID), M, DAG);
+ EVT VT = Op.getValueType();
+ if (Subtarget->hasUnpackedD16VMem() &&
+ VT.isVector() && VT.getScalarSizeInBits() == 16) {
+ return adjustLoadValueType(getImageOpcode(IntrID), cast<MemSDNode>(Op),
+ DAG);
}
return SDValue();
@@ -5009,8 +5124,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op));
}
- if ((Subtarget->hasUnpackedD16VMem() && Op.getValueType() == MVT::v2f16) ||
- Op.getValueType() == MVT::v4f16) {
+ if (Subtarget->hasUnpackedD16VMem() &&
+ Op.getValueType().isVector() &&
+ Op.getValueType().getScalarSizeInBits() == 16) {
return adjustLoadValueType(getImageOpcode(IntrID), cast<MemSDNode>(Op),
DAG);
}
@@ -5018,21 +5134,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return SDValue();
}
default:
- EVT LoadVT = Op.getValueType();
- if (LoadVT.getScalarSizeInBits() != 16)
- return SDValue();
-
- const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr =
- AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrID);
- if (D16ImageDimIntr) {
- bool Unpacked = Subtarget->hasUnpackedD16VMem();
- MemSDNode *M = cast<MemSDNode>(Op);
-
- if (isTypeLegal(LoadVT) && (!Unpacked || LoadVT == MVT::f16))
- return SDValue();
-
- return adjustLoadValueType(D16ImageDimIntr->D16HelperIntr,
- M, DAG, true);
+ if (Subtarget->hasUnpackedD16VMem() &&
+ Op.getValueType().isVector() &&
+ Op.getValueType().getScalarSizeInBits() == 16) {
+ if (const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr =
+ AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrID)) {
+ return adjustLoadValueType(D16ImageDimIntr->D16HelperIntr,
+ cast<MemSDNode>(Op), DAG, true);
+ }
}
return SDValue();
@@ -5061,13 +5170,8 @@ SDValue SITargetLowering::handleD16VData(SDValue VData,
return DAG.UnrollVectorOp(ZExt.getNode());
}
- if (isTypeLegal(StoreVT))
- return VData;
-
- // If target supports packed vmem, we just need to workaround
- // the illegal type by casting to an equivalent one.
- EVT EquivStoreVT = getEquivalentMemType(*DAG.getContext(), StoreVT);
- return DAG.getNode(ISD::BITCAST, DL, EquivStoreVT, VData);
+ assert(isTypeLegal(StoreVT));
+ return VData;
}
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
@@ -5261,9 +5365,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
case Intrinsic::amdgcn_image_store:
case Intrinsic::amdgcn_image_store_mip: {
SDValue VData = Op.getOperand(2);
- if ((Subtarget->hasUnpackedD16VMem() &&
- VData.getValueType() == MVT::v2f16) ||
- VData.getValueType() == MVT::v4f16) {
+ EVT VT = VData.getValueType();
+ if (Subtarget->hasUnpackedD16VMem() &&
+ VT.isVector() && VT.getScalarSizeInBits() == 16) {
SDValue Chain = Op.getOperand(0);
VData = handleD16VData(VData, DAG);
@@ -5293,9 +5397,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
if (D16ImageDimIntr) {
SDValue VData = Op.getOperand(2);
EVT StoreVT = VData.getValueType();
- if (((StoreVT == MVT::v2f16 || StoreVT == MVT::v4f16) &&
- Subtarget->hasUnpackedD16VMem()) ||
- !isTypeLegal(StoreVT)) {
+ if (Subtarget->hasUnpackedD16VMem() &&
+ StoreVT.isVector() &&
+ StoreVT.getScalarSizeInBits() == 16) {
SmallVector<SDValue, 12> Ops(Op.getNode()->op_values());
Ops[1] = DAG.getConstant(D16ImageDimIntr->D16HelperIntr, DL, MVT::i32);
@@ -5521,8 +5625,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
}
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
- if (Op.getValueType() != MVT::i64)
- return SDValue();
+ EVT VT = Op.getValueType();
+ assert(VT.getSizeInBits() == 64);
SDLoc DL(Op);
SDValue Cond = Op.getOperand(0);
@@ -5544,7 +5648,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
- return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
+ return DAG.getNode(ISD::BITCAST, DL, VT, Res);
}
// Catch division cases where we can use shortcuts with rcp and rsq
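The comments on splitUnaryVectorOp/splitBinaryVectorOp above explain the intent: once v4i16/v4f16 are legal, LegalizeDAG would otherwise scalarize a 4-lane op into four 16-bit ops, so the patch instead splits it into two already-legal v2 halves. A minimal host-side model of that strategy (function names here are invented for illustration, not LLVM APIs):

// Model of the split strategy: lower a 4-lane elementwise op by running the
// (already legal) 2-lane op on the low and high halves and concatenating.
#include <array>
#include <cassert>
#include <cstdint>

using V2 = std::array<uint16_t, 2>;
using V4 = std::array<uint16_t, 4>;

static V2 add_v2(V2 a, V2 b) {                  // stands in for the legal v2i16 op
  return {uint16_t(a[0] + b[0]), uint16_t(a[1] + b[1])};
}

static V4 add_v4_split(V4 a, V4 b) {
  V2 lo = add_v2({a[0], a[1]}, {b[0], b[1]});   // SplitVectorOperand -> low half
  V2 hi = add_v2({a[2], a[3]}, {b[2], b[3]});   // SplitVectorOperand -> high half
  return {lo[0], lo[1], hi[0], hi[1]};          // CONCAT_VECTORS
}

int main() {
  V4 a{1, 2, 3, 4}, b{10, 20, 30, 40};
  V4 r = add_v4_split(a, b);
  for (int i = 0; i < 4; ++i)
    assert(r[i] == uint16_t(a[i] + b[i]));
  return 0;
}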
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index db5a1dc9641..b4546811d3e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -268,7 +268,10 @@ public:
EVT VT) const override;
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+ SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
+ SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 31c606123e2..1a7e147d1c5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -795,6 +795,27 @@ foreach Index = 0-15 in {
>;
}
+
+def : Pat <
+ (extract_subvector v4i16:$vec, (i32 0)),
+ (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub0))
+>;
+
+def : Pat <
+ (extract_subvector v4i16:$vec, (i32 2)),
+ (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub1))
+>;
+
+def : Pat <
+ (extract_subvector v4f16:$vec, (i32 0)),
+ (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub0))
+>;
+
+def : Pat <
+ (extract_subvector v4f16:$vec, (i32 2)),
+ (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1))
+>;
+
let SubtargetPredicate = isGCN in {
// FIXME: Why do only some of these type combinations for SReg and
@@ -834,6 +855,26 @@ def : BitConvert <f64, v2f32, VReg_64>;
def : BitConvert <v2f32, f64, VReg_64>;
def : BitConvert <f64, v2i32, VReg_64>;
def : BitConvert <v2i32, f64, VReg_64>;
+
+// FIXME: Make SGPR
+def : BitConvert <v2i32, v4f16, VReg_64>;
+def : BitConvert <v4f16, v2i32, VReg_64>;
+def : BitConvert <v2i32, v4f16, VReg_64>;
+def : BitConvert <v2i32, v4i16, VReg_64>;
+def : BitConvert <v4i16, v2i32, VReg_64>;
+def : BitConvert <v2f32, v4f16, VReg_64>;
+def : BitConvert <v4f16, v2f32, VReg_64>;
+def : BitConvert <v2f32, v4i16, VReg_64>;
+def : BitConvert <v4i16, v2f32, VReg_64>;
+def : BitConvert <v4i16, f64, VReg_64>;
+def : BitConvert <v4f16, f64, VReg_64>;
+def : BitConvert <f64, v4i16, VReg_64>;
+def : BitConvert <f64, v4f16, VReg_64>;
+def : BitConvert <v4i16, i64, VReg_64>;
+def : BitConvert <v4f16, i64, VReg_64>;
+def : BitConvert <i64, v4i16, VReg_64>;
+def : BitConvert <i64, v4f16, VReg_64>;
+
def : BitConvert <v4i32, v4f32, VReg_128>;
def : BitConvert <v4f32, v4i32, VReg_128>;
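The extract_subvector and BitConvert patterns above lean on v4i16/v4f16 occupying the same 64-bit register pair as i64/f64/v2i32: elements 0-1 live in sub0, elements 2-3 in sub1, and the casts move no bits. A host-side illustration (memcpy-based reinterpretation; same little-endian layout assumption as before):

// A v4x16 value, its two 32-bit subregisters (sub0/sub1), and a same-width
// f64 reinterpretation all describe the same 64 bits.
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  const uint16_t v[4] = {0xAAAA, 0xBBBB, 0xCCCC, 0xDDDD};

  uint64_t bits = 0;
  std::memcpy(&bits, v, sizeof(bits));      // v4i16 viewed as i64 (BitConvert)

  uint32_t sub0 = (uint32_t)bits;           // extract_subvector index 0 -> lanes 0,1
  uint32_t sub1 = (uint32_t)(bits >> 32);   // extract_subvector index 2 -> lanes 2,3
  assert(sub0 == ((uint32_t)v[0] | ((uint32_t)v[1] << 16)));
  assert(sub1 == ((uint32_t)v[2] | ((uint32_t)v[3] << 16)));

  double d;                                 // v4i16 <-> f64 BitConvert moves no bits
  std::memcpy(&d, &bits, sizeof(d));
  uint64_t back;
  std::memcpy(&back, &d, sizeof(back));
  assert(back == bits);
  return 0;
}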
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index e894a3da513..f87a0763b35 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -435,22 +435,22 @@ def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
let AllocationPriority = 7;
}
-def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> {
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> {
let CopyCost = 1;
let AllocationPriority = 8;
}
-def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add TTMP_64Regs)> {
+def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add TTMP_64Regs)> {
let isAllocatable = 0;
}
-def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
+def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32,
(add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> {
let CopyCost = 1;
let AllocationPriority = 8;
}
-def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
+def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32,
(add SReg_64_XEXEC, EXEC)> {
let CopyCost = 1;
let AllocationPriority = 8;
@@ -505,7 +505,7 @@ def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
}
// Register class for all vector registers (VGPRs + Interploation Registers)
-def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> {
+def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32, (add VGPR_64)> {
let Size = 64;
// Requires 2 v_mov_b32 to copy