diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.td | 6 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 59 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 5 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 3 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 13 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 80 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.h | 3 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.td | 1 | ||||
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstructions.td | 16 |
12 files changed, 163 insertions, 29 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 0de72734b71..17d6b460a80 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -257,6 +257,12 @@ def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals", [FeatureFP64FP16Denormals] >; +def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp", + "DX10Clamp", + "true", + "clamp modifier clamps NaNs to 0.0" +>; + def FeatureFPExceptions : SubtargetFeature<"fp-exceptions", "FPExceptions", "true", diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index c3ff2e32254..577509d31ce 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -586,7 +586,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.IEEEMode = STM.enableIEEEBit(MF); // Make clamp modifier on NaN input returns 0. - ProgInfo.DX10Clamp = 1; + ProgInfo.DX10Clamp = STM.enableDX10Clamp(); const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); ProgInfo.ScratchSize = FrameInfo.getStackSize(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 12c17c3e9eb..9e3ac8b871e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1012,22 +1012,29 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, EVT VT = Op.getValueType(); switch (IntrinsicID) { - default: return Op; - case AMDGPUIntrinsic::AMDGPU_clamp: // Legacy name. - return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + default: return Op; + case AMDGPUIntrinsic::AMDGPU_clamp: { + // Deprecated in favor of emitting min/max combo or fmed3. + ConstantFPSDNode *CSrc1 = dyn_cast<ConstantFPSDNode>(Op.getOperand(2)); + ConstantFPSDNode *CSrc2 = dyn_cast<ConstantFPSDNode>(Op.getOperand(3)); + if (CSrc1 && CSrc2 && CSrc1->isZero() && CSrc2->isExactlyValue(1.0)) + return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, Op.getOperand(1)); - case AMDGPUIntrinsic::AMDGPU_bfe_i32: - return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)); + SDValue Max = DAG.getNode(ISD::FMAXNUM, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + return DAG.getNode(ISD::FMINNUM, DL, VT, Max, Op.getOperand(3)); + } + case AMDGPUIntrinsic::AMDGPU_bfe_i32: + return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)); - case AMDGPUIntrinsic::AMDGPU_bfe_u32: - return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)); + case AMDGPUIntrinsic::AMDGPU_bfe_u32: + return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)); } } @@ -2445,6 +2452,28 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, SN->getBasePtr(), SN->getMemOperand()); } +SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); + if (!CSrc) + return SDValue(); + + const APFloat &F = CSrc->getValueAPF(); + APFloat Zero = APFloat::getZero(F.getSemantics()); + APFloat::cmpResult Cmp0 = F.compare(Zero); + if (Cmp0 == APFloat::cmpLessThan || + (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) { + return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0)); + } + + APFloat One(F.getSemantics(), "1.0"); + APFloat::cmpResult Cmp1 = F.compare(One); + if (Cmp1 == APFloat::cmpGreaterThan) + return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0)); + + return SDValue(CSrc, 0); +} + /// Split the 64-bit value \p LHS into two 32-bit components, and perform the /// binary operation \p Opc to it with the corresponding constant operands. SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( @@ -3323,6 +3352,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, return performLoadCombine(N, DCI); case ISD::STORE: return performStoreCombine(N, DCI); + case AMDGPUISD::CLAMP: + return performClampCombine(N, DCI); } return SDValue(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 58ac09f2698..fb487f914ad 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -70,6 +70,7 @@ protected: bool shouldCombineMemoryType(EVT VT) const; SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, @@ -238,7 +239,11 @@ enum NodeType : unsigned { RETURN, DWORDADDR, FRACT, + + /// CLAMP value between 0.0 and 1.0. NaN clamped to 0, following clamp output + /// modifier behavior with dx10_enable. CLAMP, + // This is SETCC with the full mask result which is used for a compare with a // result bit per item in the wavefront. SETCC, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 91341ef88ea..a081c30ef81 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -92,7 +92,7 @@ def AMDGPUfmul_legacy : SDNode<"AMDGPUISD::FMUL_LEGACY", SDTFPBinOp, [SDNPCommutative, SDNPAssociative] >; -def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>; +def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>; // out = min(a, b) a and b are floats, where a nan comparison fails. def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 7d2a52bccc8..e76891c3bed 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -452,7 +452,7 @@ class CLAMP <RegisterClass rc> : AMDGPUShaderInst < (outs rc:$dst), (ins rc:$src0), "CLAMP $dst, $src0", - [(set f32:$dst, (AMDGPUclamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))] + [(set f32:$dst, (AMDGPUclamp f32:$src0))] >; class FABS <RegisterClass rc> : AMDGPUShaderInst < diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 32b04fe7a1e..a719931eff0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -42,7 +42,7 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, // for SI has the unhelpful behavior that it unsets everything else if you // disable it. - SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+load-store-opt,"); + SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,"); if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,"; @@ -89,6 +89,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, FP32Denormals(false), FP64FP16Denormals(false), FPExceptions(false), + DX10Clamp(false), FlatForGlobal(false), UnalignedScratchAccess(false), UnalignedBufferAccess(false), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 433c295feaf..8f1aaa1d893 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -103,6 +103,7 @@ protected: bool FP32Denormals; bool FP64FP16Denormals; bool FPExceptions; + bool DX10Clamp; bool FlatForGlobal; bool UnalignedScratchAccess; bool UnalignedBufferAccess; @@ -294,10 +295,6 @@ public: return DumpCode; } - bool enableIEEEBit(const MachineFunction &MF) const { - return AMDGPU::isCompute(MF.getFunction()->getCallingConv()); - } - /// Return the amount of LDS that can be used that will not restrict the /// occupancy lower than WaveCount. unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, @@ -323,6 +320,14 @@ public: return FPExceptions; } + bool enableDX10Clamp() const { + return DX10Clamp; + } + + bool enableIEEEBit(const MachineFunction &MF) const { + return AMDGPU::isCompute(MF.getFunction()->getCallingConv()); + } + bool useFlatForGlobal() const { return FlatForGlobal; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 60acf4fb1c0..40a28203000 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3995,8 +3995,10 @@ static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { return DAG.isKnownNeverNaN(Op); } -static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, - SDValue Op0, SDValue Op1) { +SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG, + const SDLoc &SL, + SDValue Op0, + SDValue Op1) const { ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1); if (!K1) return SDValue(); @@ -4010,6 +4012,20 @@ static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, if (Cmp == APFloat::cmpGreaterThan) return SDValue(); + // TODO: Check IEEE bit enabled? + EVT VT = K0->getValueType(0); + if (Subtarget->enableDX10Clamp()) { + // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the + // hardware fmed3 behavior converting to a min. + // FIXME: Should this be allowing -0.0? + if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0)) + return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0)); + } + + // No med3 for f16, but clamp is possible. + if (VT == MVT::f16) + return SDValue(); + // This isn't safe with signaling NaNs because in IEEE mode, min/max on a // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then // give the other result, which is different from med3 with a NaN input. @@ -4074,7 +4090,9 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) || (Opc == AMDGPUISD::FMIN_LEGACY && Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && - N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) { + (N->getValueType(0) == MVT::f32 || + (N->getValueType(0) == MVT::f16 && Subtarget->has16BitInsts())) && + Op0.hasOneUse()) { if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) return Res; } @@ -4082,6 +4100,60 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, return SDValue(); } +static bool isClampZeroToOne(SDValue A, SDValue B) { + if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) { + if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) { + // FIXME: Should this be allowing -0.0? + return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) || + (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0)); + } + } + + return false; +} + +// FIXME: Should only worry about snans for version with chain. +SDValue SITargetLowering::performFMed3Combine(SDNode *N, + DAGCombinerInfo &DCI) const { + EVT VT = N->getValueType(0); + // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and + // NaNs. With a NaN input, the order of the operands may change the result. + + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + + SDValue Src0 = N->getOperand(0); + SDValue Src1 = N->getOperand(1); + SDValue Src2 = N->getOperand(2); + + if (isClampZeroToOne(Src0, Src1)) { + // const_a, const_b, x -> clamp is safe in all cases including signaling + // nans. + // FIXME: Should this be allowing -0.0? + return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2); + } + + // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother + // handling no dx10-clamp? + if (Subtarget->enableDX10Clamp()) { + // If NaNs is clamped to 0, we are free to reorder the inputs. + + if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1)) + std::swap(Src0, Src1); + + if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2)) + std::swap(Src1, Src2); + + if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1)) + std::swap(Src0, Src1); + + if (isClampZeroToOne(Src1, Src2)) + return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0); + } + + return SDValue(); +} + unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const { @@ -4348,6 +4420,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case AMDGPUISD::CVT_F32_UBYTE2: case AMDGPUISD::CVT_F32_UBYTE3: return performCvtF32UByteNCombine(N, DCI); + case AMDGPUISD::FMED3: + return performFMed3Combine(N, DCI); } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 037b6f730c5..fe86f8e2b65 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -84,7 +84,10 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, + SDValue Op0, SDValue Op1) const; SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const; unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 94580543afd..aa6d677f466 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -625,6 +625,7 @@ def SRCMODS { def DSTCLAMP { int NONE = 0; + int ENABLE = 1; } def DSTOMOD { diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 1d09880147d..4bd759012a3 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -647,12 +647,20 @@ def : BitConvert <v16f32, v16i32, VReg_512>; /********** Src & Dst modifiers **********/ /********** =================== **********/ -def : Pat < - (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod), - (f32 FP_ZERO), (f32 FP_ONE)), - (V_ADD_F32_e64 $src0_modifiers, $src0, 0, (i32 0), 1, $omod) + +// If denormals are not enabled, it only impacts the compare of the +// inputs. The output result is not flushed. +class ClampPat<Instruction inst, ValueType vt> : Pat < + (vt (AMDGPUclamp + (VOP3Mods0Clamp vt:$src0, i32:$src0_modifiers, i32:$omod))), + (inst i32:$src0_modifiers, vt:$src0, + i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, $omod) >; +// TODO: Does f64 support clamp? +def : ClampPat<V_MAX_F32_e64, f32>; +def : ClampPat<V_MAX_F16_e64, f16>; + /********** ================================ **********/ /********** Floating point absolute/negative **********/ /********** ================================ **********/ |

