author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2017-03-31 19:53:03 +0000
---|---|---
committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2017-03-31 19:53:03 +0000
commit | 8edfaee7be8ba17cd054a90055c91a18c4d239e3 (patch) |
tree | f2cc356ac8fcce91d9ab50e1eff2e762995700a0 /llvm/lib/Target/AMDGPU |
parent | 3c99441ef4c5325ac3d9e771defae20907245400 (diff) |
AMDGPU: Remove unnecessary ands when f16 is legal
Add a new node that acts as a fancy bitcast from f16 operations to
i32 and implicitly zeroes the high 16 bits of the result.

Alternatively, we could try making v2f16 legal and canonicalizing
on build_vectors.
llvm-svn: 299246
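As a quick illustration (a hypothetical snippet, not taken from the commit or its tests; the function and value names are invented), the new combine targets input along these lines, where the raw bits of an f16 operation are zero-extended to i32. On subtargets with 16-bit instructions the f16 result already sits in the low half of a 32-bit register with the high half zeroed, so the masking that lowering the zext would otherwise produce (the "unnecessary ands" of the title) can be dropped.

```llvm
; Hypothetical example, not taken from the commit or its tests.
define i32 @f16_bits_to_i32(half %a, half %b) {
  %sum  = fadd half %a, %b           ; f16 op assumed to leave the high 16 bits zero
  %bits = bitcast half %sum to i16   ; reinterpret the f16 result as its raw bits
  %ext  = zext i16 %bits to i32      ; the zext whose masking the combine removes
  ret i32 %ext
}
```

With this patch, the (i32 zext (i16 bitcast f16:$src)) chain is rewritten to AMDGPUISD::FP16_ZEXT once DAG combining runs after legalization, and the new SIInstructions.td pattern selects that node to a plain COPY.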
Diffstat (limited to 'llvm/lib/Target/AMDGPU')

Mode | File | Lines changed
---|---|---
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 10
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 3
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 1
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 39
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.h | 1
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstructions.td | 5

6 files changed, 57 insertions, 2 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 3f41f4e695e..54ec756bb18 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3491,6 +3491,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(CVT_F32_UBYTE3)
   NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
   NODE_NAME_CASE(FP_TO_FP16)
+  NODE_NAME_CASE(FP16_ZEXT)
   NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
   NODE_NAME_CASE(CONST_DATA_PTR)
   NODE_NAME_CASE(PC_ADD_REL_OFFSET)
@@ -3587,7 +3588,10 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
     break;
   }
-  case AMDGPUISD::FP_TO_FP16: {
+  case AMDGPUISD::FP_TO_FP16:
+  case AMDGPUISD::FP16_ZEXT: {
+    unsigned BitWidth = KnownZero.getBitWidth();
+
     // High bits are zero.
     KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
     break;
@@ -3621,7 +3625,9 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
   case AMDGPUISD::CARRY:
   case AMDGPUISD::BORROW:
     return 31;
-
+  case AMDGPUISD::FP_TO_FP16:
+  case AMDGPUISD::FP16_ZEXT:
+    return 16;
   default:
     return 1;
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 66e8464fd7f..cffed2cfeb7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -353,6 +353,9 @@ enum NodeType : unsigned {
   // are known 0.
   FP_TO_FP16,

+  // Wrapper around fp16 results that are known to zero the high bits.
+  FP16_ZEXT,
+
   /// This node is for VLIW targets and it is used to represent a vector
   /// that is stored in consecutive registers with the same channel.
   /// For example:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index ed2c1b970e8..56f060984f0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -112,6 +112,7 @@ def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
 def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
 def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
+def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>;
 def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 924bae31a12..f1f37f4c0f7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -475,6 +475,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::UINT_TO_FP);
   setTargetDAGCombine(ISD::FCANONICALIZE);
   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
+  setTargetDAGCombine(ISD::ZERO_EXTEND);

   // All memory operations. Some folding on the pointer operand is done to help
   // matching the constant offsets in the addressing modes.
@@ -4058,6 +4059,42 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
   return SDValue();
 }

+static bool fp16SrcZerosHighBits(unsigned Opc) {
+  switch (Opc) {
+  case ISD::SELECT:
+  case ISD::EXTRACT_VECTOR_ELT:
+    return false;
+  default:
+    return true;
+  }
+}
+
+SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
+                                                   DAGCombinerInfo &DCI) const {
+  if (!Subtarget->has16BitInsts() ||
+      DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i32)
+    return SDValue();
+
+  SDValue Src = N->getOperand(0);
+  if (Src.getValueType() != MVT::i16)
+    return SDValue();
+
+  // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
+  // FIXME: It is not universally true that the high bits are zeroed on gfx9.
+  if (Src.getOpcode() == ISD::BITCAST) {
+    SDValue BCSrc = Src.getOperand(0);
+    if (BCSrc.getValueType() == MVT::f16 &&
+        fp16SrcZerosHighBits(BCSrc.getOpcode()))
+      return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
+  }
+
+  return SDValue();
+}
+
 SDValue SITargetLowering::performClassCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -4594,6 +4631,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     return performOrCombine(N, DCI);
   case ISD::XOR:
     return performXorCombine(N, DCI);
+  case ISD::ZERO_EXTEND:
+    return performZeroExtendCombine(N, DCI);
   case AMDGPUISD::FP_CLASS:
     return performClassCombine(N, DCI);
   case ISD::FCANONICALIZE:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 39f7a671749..550e8a2e38f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -81,6 +81,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performZeroExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9e343aab71e..fde14792ea6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1076,6 +1076,11 @@ def : Pat <
 //===----------------------------------------------------------------------===//
 // Miscellaneous Patterns
 //===----------------------------------------------------------------------===//
+def : Pat <
+  (i32 (AMDGPUfp16_zext f16:$src)),
+  (COPY $src)
+>;
+
 def : Pat <
   (i32 (trunc i64:$a)),