diff options
author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2016-01-28 20:53:42 +0000 |
---|---|---|
committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2016-01-28 20:53:42 +0000 |
commit | f639c32739b7ddff3fcf5cc242027bbbe2b6fba9 (patch) | |
tree | 9ed8ab501ac0e8153c4be4989cace13b557310c4 /llvm/lib | |
parent | 7293f9895e10869d0c2dc39c4b2155592a83004d (diff) | |
download | bcm5719-llvm-f639c32739b7ddff3fcf5cc242027bbbe2b6fba9.tar.gz bcm5719-llvm-f639c32739b7ddff3fcf5cc242027bbbe2b6fba9.zip |
AMDGPU: Match some med3 patterns
llvm-svn: 259089
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.td | 6 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 5 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 3 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 10 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 6 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 5 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 93 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.h | 3 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstructions.td | 6 |
9 files changed, 124 insertions, 13 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index c8c550cd0e5..b842ba17675 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -169,6 +169,12 @@ def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals", [FeatureFP64] >; +def FeatureFPExceptions : SubtargetFeature<"fp-exceptions", + "FPExceptions", + "true", + "Enable floating point exceptions" +>; + def FeatureEnableHugeScratchBuffer : SubtargetFeature< "huge-scratch-buffer", "EnableHugeScratchBuffer", diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 48f61fb250b..0d5a8086fe4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -397,7 +397,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, // SI at least has hardware support for floating point exceptions, but no way // of using or handling them is implemented. They are also optional in OpenCL // (Section 7.3) - setHasFloatingPointExceptions(false); + setHasFloatingPointExceptions(Subtarget->hasFPExceptions()); setSelectIsExpensive(false); PredictableSelectIsExpensive = false; @@ -2949,6 +2949,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FMIN3) NODE_NAME_CASE(SMIN3) NODE_NAME_CASE(UMIN3) + NODE_NAME_CASE(FMED3) + NODE_NAME_CASE(SMED3) + NODE_NAME_CASE(UMED3) NODE_NAME_CASE(URECIP) NODE_NAME_CASE(DIV_SCALE) NODE_NAME_CASE(DIV_FMAS) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index abd2b5e2c5e..34e13f56536 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -257,6 +257,9 @@ enum NodeType : unsigned { FMIN3, SMIN3, UMIN3, + FMED3, + SMED3, + UMED3, URECIP, DIV_SCALE, DIV_FMAS, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 575dfe41365..5e6d3102027 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -209,6 +209,16 @@ def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp, [] >; +def AMDGPUsmed3 : SDNode<"AMDGPUISD::SMED3", AMDGPUDTIntTernaryOp, + [] +>; + +def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp, + [] +>; + +def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; + def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG", SDTypeProfile<0, 1, [SDTCisInt<0>]>, [SDNPHasChain, SDNPInGlue]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index b0dae4a30c7..39b7030aa84 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -66,9 +66,9 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, : AMDGPUGenSubtargetInfo(TT, GPU, FS), DumpCode(false), R600ALUInst(false), HasVertexCache(false), TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false), - FP64Denormals(false), FP32Denormals(false), FastFMAF32(false), - HalfRate64Ops(false), CaymanISA(false), FlatAddressSpace(false), - FlatForGlobal(false), EnableIRStructurizer(true), + FP64Denormals(false), FP32Denormals(false), FPExceptions(false), + FastFMAF32(false), HalfRate64Ops(false), CaymanISA(false), + FlatAddressSpace(false), FlatForGlobal(false), EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 97c521949ca..109ca9f9ce1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -66,6 +66,7 @@ private: bool FP64; bool FP64Denormals; bool FP32Denormals; + bool FPExceptions; bool FastFMAF32; bool HalfRate64Ops; bool CaymanISA; @@ -150,6 +151,10 @@ public: return FP64Denormals; } + bool hasFPExceptions() const { + return FPExceptions; + } + bool hasFastFMAF32() const { return FastFMAF32; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index faecf3c1da9..90f74d48065 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2131,8 +2131,70 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { } } -SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, - DAGCombinerInfo &DCI) const { +static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, + SDLoc SL, + SDValue Op0, + SDValue Op1, + bool Signed) { + ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1); + if (!K1) + return SDValue(); + + ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1)); + if (!K0) + return SDValue(); + + + if (Signed) { + if (K0->getAPIntValue().sge(K1->getAPIntValue())) + return SDValue(); + } else { + if (K0->getAPIntValue().uge(K1->getAPIntValue())) + return SDValue(); + } + + EVT VT = K0->getValueType(0); + return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT, + Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); +} + +static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { + if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions()) + return true; + + return DAG.isKnownNeverNaN(Op); +} + +static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, + SDLoc SL, + SDValue Op0, + SDValue Op1) { + ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1); + if (!K1) + return SDValue(); + + ConstantFPSDNode *K0 = dyn_cast<ConstantFPSDNode>(Op0.getOperand(1)); + if (!K0) + return SDValue(); + + // Ordered >= (although NaN inputs should have folded away by now). + APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF()); + if (Cmp == APFloat::cmpGreaterThan) + return SDValue(); + + // This isn't safe with signaling NaNs because in IEEE mode, min/max on a + // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then + // give the other result, which is different from med3 with a NaN input. + SDValue Var = Op0.getOperand(0); + if (!isKnownNeverSNan(DAG, Var)) + return SDValue(); + + return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), + Var, SDValue(K0, 0), SDValue(K1, 0)); +} + +SDValue SITargetLowering::performMinMaxCombine(SDNode *N, + DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; unsigned Opc = N->getOpcode(); @@ -2142,7 +2204,8 @@ SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, // Only do this if the inner op has one use since this will just increases // register pressure for no benefit. - // max(max(a, b), c) + // max(max(a, b), c) -> max3(a, b, c) + // min(min(a, b), c) -> min3(a, b, c) if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { SDLoc DL(N); return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), @@ -2153,7 +2216,9 @@ SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, Op1); } - // max(a, max(b, c)) + // Try commuted. + // max(a, max(b, c)) -> max3(a, b, c) + // min(a, min(b, c)) -> min3(a, b, c) if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { SDLoc DL(N); return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), @@ -2164,6 +2229,24 @@ SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, Op1.getOperand(1)); } + // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1) + if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) { + if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true)) + return Med3; + } + + if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) { + if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false)) + return Med3; + } + + // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1) + if (Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM && + N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) { + if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) + return Res; + } + return SDValue(); } @@ -2217,7 +2300,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && N->getValueType(0) != MVT::f64 && getTargetMachine().getOptLevel() > CodeGenOpt::None) - return performMin3Max3Combine(N, DCI); + return performMinMaxCombine(N, DCI); break; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 4587b030cab..d321805ec46 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -54,7 +54,8 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; bool isLegalFlatAddressingMode(const AddrMode &AM) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index a4fc2e3374c..a16491e1961 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1695,13 +1695,13 @@ defm V_MAX3_U32 : VOP3Inst <vop3<0x156, 0x1d5>, "v_max3_u32", VOP_I32_I32_I32_I32, AMDGPUumax3 >; defm V_MED3_F32 : VOP3Inst <vop3<0x157, 0x1d6>, "v_med3_f32", - VOP_F32_F32_F32_F32 + VOP_F32_F32_F32_F32, AMDGPUfmed3 >; defm V_MED3_I32 : VOP3Inst <vop3<0x158, 0x1d7>, "v_med3_i32", - VOP_I32_I32_I32_I32 + VOP_I32_I32_I32_I32, AMDGPUsmed3 >; defm V_MED3_U32 : VOP3Inst <vop3<0x159, 0x1d8>, "v_med3_u32", - VOP_I32_I32_I32_I32 + VOP_I32_I32_I32_I32, AMDGPUumed3 >; //def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>; |