author    Matt Arsenault <Matthew.Arsenault@amd.com>  2017-03-31 19:53:03 +0000
committer Matt Arsenault <Matthew.Arsenault@amd.com>  2017-03-31 19:53:03 +0000
commit    8edfaee7be8ba17cd054a90055c91a18c4d239e3 (patch)
tree      f2cc356ac8fcce91d9ab50e1eff2e762995700a0 /llvm/lib/Target/AMDGPU
parent    3c99441ef4c5325ac3d9e771defae20907245400 (diff)
AMDGPU: Remove unnecessary ands when f16 is legal
Add a new node to act as a fancy bitcast from f16 operations to i32 that implicitly zeroes the high 16 bits of the result. Alternatively, we could try making v2f16 legal and canonicalizing on build_vectors.

llvm-svn: 299246
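The motivation, in scalar terms: once the f16-producing instruction is known to have written zeros into bits 31..16 of its 32-bit destination register, the mask that a plain zero_extend would otherwise introduce is a no-op. A minimal standalone sketch of that fact (hypothetical values, not part of this commit):

    // If the high half of the 32-bit register is already zero, the
    // zero-extension mask "and r, 0xffff" changes nothing.
    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t Reg = 0x00003C00u;      // f16 1.0 in the low 16 bits, high half zeroed
      assert((Reg & 0xFFFFu) == Reg);  // masking is redundant and can be dropped
      return 0;
    }

The new AMDGPUISD::FP16_ZEXT node records that guarantee in the DAG, so the known-bits handling and the combine added below can remove the unnecessary ANDs.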
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp  10
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h     3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td       1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp      39
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.h         1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td        5
6 files changed, 57 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 3f41f4e695e..54ec756bb18 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3491,6 +3491,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CVT_F32_UBYTE3)
NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
NODE_NAME_CASE(FP_TO_FP16)
+ NODE_NAME_CASE(FP16_ZEXT)
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
@@ -3587,7 +3588,10 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
break;
}
- case AMDGPUISD::FP_TO_FP16: {
+ case AMDGPUISD::FP_TO_FP16:
+ case AMDGPUISD::FP16_ZEXT: {
+ unsigned BitWidth = KnownZero.getBitWidth();
+
// High bits are zero.
KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
break;
@@ -3621,7 +3625,9 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
case AMDGPUISD::CARRY:
case AMDGPUISD::BORROW:
return 31;
-
+ case AMDGPUISD::FP_TO_FP16:
+ case AMDGPUISD::FP16_ZEXT:
+ return 16;
default:
return 1;
}
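For reference, the mask reported in the computeKnownBitsForTargetNode hunk above covers bits 31..16, which is also why ComputeNumSignBitsForTargetNode can return 16: at least the top 16 bits all equal the (zero) sign bit. A small sketch of the APInt call (assumes LLVM headers are available; not part of the commit):

    // APInt::getHighBitsSet(32, 16) == 0xFFFF0000, i.e. bits 31..16 are
    // reported as known zero for FP_TO_FP16 / FP16_ZEXT results.
    #include "llvm/ADT/APInt.h"
    #include <cassert>

    int main() {
      unsigned BitWidth = 32;
      llvm::APInt KnownZero = llvm::APInt::getHighBitsSet(BitWidth, BitWidth - 16);
      assert(KnownZero == llvm::APInt(BitWidth, 0xFFFF0000u));
      return 0;
    }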
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 66e8464fd7f..cffed2cfeb7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -353,6 +353,9 @@ enum NodeType : unsigned {
// are known 0.
FP_TO_FP16,
+ // Wrapper around fp16 results that are known to zero the high bits.
+ FP16_ZEXT,
+
/// This node is for VLIW targets and it is used to represent a vector
/// that is stored in consecutive registers with the same channel.
/// For example:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index ed2c1b970e8..56f060984f0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -112,6 +112,7 @@ def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
+def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>;
def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 924bae31a12..f1f37f4c0f7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -475,6 +475,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::FCANONICALIZE);
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
// All memory operations. Some folding on the pointer operand is done to help
// matching the constant offsets in the addressing modes.
@@ -4058,6 +4059,42 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
return SDValue();
}
+static bool fp16SrcZerosHighBits(unsigned Opc) {
+ switch (Opc) {
+ case ISD::SELECT:
+ case ISD::EXTRACT_VECTOR_ELT:
+ return false;
+ default:
+ return true;
+ }
+}
+
+SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (!Subtarget->has16BitInsts() ||
+ DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i32)
+ return SDValue();
+
+ SDValue Src = N->getOperand(0);
+ if (Src.getValueType() != MVT::i16)
+ return SDValue();
+
+ // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
+ // FIXME: It is not universally true that the high bits are zeroed on gfx9.
+ if (Src.getOpcode() == ISD::BITCAST) {
+ SDValue BCSrc = Src.getOperand(0);
+ if (BCSrc.getValueType() == MVT::f16 &&
+ fp16SrcZerosHighBits(BCSrc.getOpcode()))
+ return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
+ }
+
+ return SDValue();
+}
+
SDValue SITargetLowering::performClassCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -4594,6 +4631,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performOrCombine(N, DCI);
case ISD::XOR:
return performXorCombine(N, DCI);
+ case ISD::ZERO_EXTEND:
+ return performZeroExtendCombine(N, DCI);
case AMDGPUISD::FP_CLASS:
return performClassCombine(N, DCI);
case ISD::FCANONICALIZE:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 39f7a671749..550e8a2e38f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -81,6 +81,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performZeroExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9e343aab71e..fde14792ea6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1076,6 +1076,11 @@ def : Pat <
//===----------------------------------------------------------------------===//
// Miscellaneous Patterns
//===----------------------------------------------------------------------===//
+def : Pat <
+ (i32 (AMDGPUfp16_zext f16:$src)),
+ (COPY $src)
+>;
+
def : Pat <
(i32 (trunc i64:$a)),