19 files changed, 153 insertions, 90 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index c4fac3eab78..72526cac113 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -69,15 +69,14 @@ using namespace llvm::AMDGPU::HSAMD;
 // We want to use these instructions, and using fp32 denormals also causes
 // instructions to run at the double precision rate for the device so it's
 // probably best to just report no single precision denormals.
-static uint32_t getFPMode(const MachineFunction &F) {
-  const GCNSubtarget& ST = F.getSubtarget<GCNSubtarget>();
-  // TODO: Is there any real use for the flush in only / flush out only modes?
+static uint32_t getFPMode(AMDGPU::SIModeRegisterDefaults Mode) {
+  // TODO: Is there any real use for the flush in only / flush out only modes?
 
   uint32_t FP32Denormals =
-    ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
+    Mode.FP32Denormals ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
 
   uint32_t FP64Denormals =
-    ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
+    Mode.FP64FP16Denormals ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
 
   return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
          FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
@@ -1033,11 +1032,12 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
       &STM, ProgInfo.NumVGPRsForWavesPerEU);
 
+  const SIModeRegisterDefaults Mode = MFI->getMode();
+
   // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
   // register.
-  ProgInfo.FloatMode = getFPMode(MF);
+  ProgInfo.FloatMode = getFPMode(Mode);
 
-  const SIModeRegisterDefaults Mode = MFI->getMode();
   ProgInfo.IEEEMode = Mode.IEEE;
 
   // Make clamp modifier on NaN input returns 0.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index ed495bd40b8..cf908766caa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -70,6 +70,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
   Module *Mod = nullptr;
   const DataLayout *DL = nullptr;
   bool HasUnsafeFPMath = false;
+  bool HasFP32Denormals = false;
 
   /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
   /// binary operation \p V.
@@ -575,7 +576,6 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
 
   Value *NewFDiv = nullptr;
 
-  bool HasDenormals = ST->hasFP32Denormals();
   if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
     NewFDiv = UndefValue::get(VT);
 
@@ -586,7 +586,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
       Value *DenEltI = Builder.CreateExtractElement(Den, I);
       Value *NewElt;
 
-      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {
+      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasFP32Denormals)) {
         NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
       } else {
         NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
@@ -595,7 +595,7 @@
       NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
     }
   } else {
-    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))
+    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasFP32Denormals))
       NewFDiv = Builder.CreateCall(Decl, { Num, Den });
   }
 
@@ -1034,6 +1034,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   DA = &getAnalysis<LegacyDivergenceAnalysis>();
   HasUnsafeFPMath = hasUnsafeFPMath(F);
+  HasFP32Denormals = ST->hasFP32Denormals(F);
 
   bool MadeChange = false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index e5732018ebb..75537cbe2ed 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -128,6 +128,10 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
   // make the right decision when generating code for different targets.
   const GCNSubtarget *Subtarget;
+
+  // Default FP mode for the current function.
+  AMDGPU::SIModeRegisterDefaults Mode;
+
   bool EnableLateStructurizeCFG;
 
 public:
@@ -393,6 +397,7 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
   }
 #endif
   Subtarget = &MF.getSubtarget<GCNSubtarget>();
+  Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
   return SelectionDAGISel::runOnMachineFunction(MF);
 }
 
@@ -2104,7 +2109,7 @@ void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
   bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
   bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);
 
-  assert((IsFMA || !Subtarget->hasFP32Denormals()) &&
+  assert((IsFMA || !Mode.FP32Denormals) &&
          "fmad selected with denormals enabled");
   // TODO: We can select this with f32 denormals enabled if all the sources are
   // converted from f16 (in which case fmad isn't legal).
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index f669c98969b..e5b94247ee4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1581,8 +1581,11 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
   // float fqneg = -fq;
   SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
 
+  MachineFunction &MF = DAG.getMachineFunction();
+  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
+
   // float fr = mad(fqneg, fb, fa);
-  unsigned OpCode = Subtarget->hasFP32Denormals() ?
+  unsigned OpCode = MFI->getMode().FP32Denormals ?
                     (unsigned)AMDGPUISD::FMAD_FTZ : (unsigned)ISD::FMAD;
   SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
 
@@ -1663,8 +1666,11 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
   }
 
   if (isTypeLegal(MVT::i64)) {
+    MachineFunction &MF = DAG.getMachineFunction();
+    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
     // Compute denominator reciprocal.
-    unsigned FMAD = Subtarget->hasFP32Denormals() ?
+    unsigned FMAD = MFI->getMode().FP32Denormals ?
                     (unsigned)AMDGPUISD::FMAD_FTZ : (unsigned)ISD::FMAD;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 3020e3dec06..9e76c47038b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -100,13 +100,16 @@ class PredicateControl {
 class AMDGPUPat<dag pattern, dag result> : Pat<pattern, result>,
       PredicateControl;
 
-def FP16Denormals : Predicate<"Subtarget->hasFP16Denormals()">;
-def FP32Denormals : Predicate<"Subtarget->hasFP32Denormals()">;
-def FP64Denormals : Predicate<"Subtarget->hasFP64Denormals()">;
-def NoFP16Denormals : Predicate<"!Subtarget->hasFP16Denormals()">;
-def NoFP32Denormals : Predicate<"!Subtarget->hasFP32Denormals()">;
-def NoFP64Denormals : Predicate<"!Subtarget->hasFP64Denormals()">;
+let RecomputePerFunction = 1 in {
+def FP16Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">;
+def FP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals">;
+def FP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">;
+def NoFP16Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">;
+def NoFP32Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals">;
+def NoFP64Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">;
 def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
+}
+
 def FMA : Predicate<"Subtarget->hasFMA()">;
 
 def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 89ca702f577..940ddff85d7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -18,6 +18,7 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
   LocalMemoryObjects(),
   ExplicitKernArgSize(0),
   LDSSize(0),
+  Mode(MF.getFunction(), MF.getSubtarget<GCNSubtarget>()),
   IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())),
   NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath),
   MemoryBound(false),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 9818ab1ef14..1933e41c66f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -11,6 +11,7 @@
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "Utils/AMDGPUBaseInfo.h"
 
 namespace llvm {
 
@@ -28,6 +29,9 @@ protected:
   /// Number of bytes in the LDS that are being used.
   unsigned LDSSize;
 
+  // State of MODE register, assumed FP mode.
+  AMDGPU::SIModeRegisterDefaults Mode;
+
   // Kernels + shaders. i.e. functions called by the driver and not called
   // by other functions.
   bool IsEntryFunction;
 
@@ -53,6 +57,10 @@ public:
     return LDSSize;
   }
 
+  AMDGPU::SIModeRegisterDefaults getMode() const {
+    return Mode;
+  }
+
   bool isEntryFunction() const {
     return IsEntryFunction;
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 936feb00c62..08878d87fb0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -148,7 +148,12 @@ public:
     return HasMadMixInsts;
   }
 
-  bool hasFP32Denormals() const {
+  bool hasFP32Denormals(const Function &F) const {
+    // FIXME: This should not be a property of the subtarget. This should be a
+    // property with a default set by the calling convention which can be
+    // overridden by attributes. For now, use the subtarget feature as a
+    // placeholder attribute. The function argument's only purpose is to
+    // discourage use without a function context until this is removed.
     return FP32Denormals;
   }
 
@@ -612,11 +617,17 @@ public:
   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
                                            const Function &) const;
 
-  bool hasFP16Denormals() const {
+  /// Alias for hasFP64FP16Denormals
+  bool hasFP16Denormals(const Function &F) const {
     return FP64FP16Denormals;
   }
 
-  bool hasFP64Denormals() const {
+  /// Alias for hasFP64FP16Denormals
+  bool hasFP64Denormals(const Function &F) const {
+    return FP64FP16Denormals;
+  }
+
+  bool hasFP64FP16Denormals(const Function &F) const {
     return FP64FP16Denormals;
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 0d44f3be539..a5066a0f669 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -412,7 +412,7 @@ int GCNTTIImpl::getArithmeticInstrCost(
 
     if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
       // TODO: This is more complicated, unsafe flags etc.
-      if ((SLT == MVT::f32 && !ST->hasFP32Denormals()) ||
+      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getQuarterRateInstrCost() * NElts;
      }
@@ -431,7 +431,7 @@ int GCNTTIImpl::getArithmeticInstrCost(
    if (SLT == MVT::f32 || SLT == MVT::f16) {
      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
 
-      if (!ST->hasFP32Denormals()) {
+      if (!HasFP32Denormals) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }
@@ -671,10 +671,13 @@ unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
 bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
                                      const Function *Callee) const {
   const TargetMachine &TM = getTLI()->getTargetMachine();
-  const FeatureBitset &CallerBits =
-    TM.getSubtargetImpl(*Caller)->getFeatureBits();
-  const FeatureBitset &CalleeBits =
-    TM.getSubtargetImpl(*Callee)->getFeatureBits();
+  const GCNSubtarget *CallerST
+    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
+  const GCNSubtarget *CalleeST
+    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
+
+  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
+  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
 
   FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
   FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
@@ -683,8 +686,8 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
 
   // FIXME: dx10_clamp can just take the caller setting, but there seems to be
   // no way to support merge for backend defined attributes.
-  AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
-  AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
+  AMDGPU::SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
+  AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
   return CallerMode.isInlineCompatible(CalleeMode);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index b6e2db454e6..b41f4348f04 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -46,7 +46,7 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
 
   Triple TargetTriple;
 
-  const TargetSubtargetInfo *ST;
+  const GCNSubtarget *ST;
   const TargetLoweringBase *TLI;
 
   const TargetSubtargetInfo *getST() const { return ST; }
@@ -73,6 +73,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   const AMDGPUTargetLowering *TLI;
   AMDGPUTTIImpl CommonTTI;
   bool IsGraphicsShader;
+  bool HasFP32Denormals;
 
   const FeatureBitset InlineFeatureIgnoreList = {
     // Codegen control options which don't matter.
@@ -131,7 +132,8 @@ public:
       ST(static_cast<const GCNSubtarget*>(TM->getSubtargetImpl(F))),
       TLI(ST->getTargetLowering()),
       CommonTTI(TM, F),
-      IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())) {}
+      IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())),
+      HasFP32Denormals(ST->hasFP32Denormals(F)) { }
 
   bool hasBranchDivergence() { return true; }
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 6196be8ec83..ee24022c65f 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -226,10 +226,8 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FMA, MVT::f64, Expand);
   }
 
-  // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we
-  // need it for R600.
-  if (!Subtarget->hasFP32Denormals())
-    setOperationAction(ISD::FMAD, MVT::f32, Legal);
+  // FIXME: May need no denormals check
+  setOperationAction(ISD::FMAD, MVT::f32, Legal);
 
   if (!Subtarget->hasBFI()) {
     // fcopysign can be done in a single instruction with BFI.
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 3e5fa7068e0..ca17ba8b722 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1359,8 +1359,8 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
   case AMDGPU::V_MUL_F32_e64:
   case AMDGPU::V_MUL_F16_e64: {
     // If output denormals are enabled, omod is ignored.
-    if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
-        (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
+    if ((Op == AMDGPU::V_MUL_F32_e64 && MFI->getMode().FP32Denormals) ||
+        (Op == AMDGPU::V_MUL_F16_e64 && MFI->getMode().FP64FP16Denormals))
       return std::make_pair(nullptr, SIOutMods::NONE);
 
     const MachineOperand *RegOp = nullptr;
@@ -1389,8 +1389,8 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
   case AMDGPU::V_ADD_F32_e64:
   case AMDGPU::V_ADD_F16_e64: {
     // If output denormals are enabled, omod is ignored.
-    if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
-        (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
+    if ((Op == AMDGPU::V_ADD_F32_e64 && MFI->getMode().FP32Denormals) ||
+        (Op == AMDGPU::V_ADD_F16_e64 && MFI->getMode().FP64FP16Denormals))
       return std::make_pair(nullptr, SIOutMods::NONE);
 
     // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1a02037fcd4..c4712198693 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -100,6 +100,16 @@ static cl::opt<bool> DisableLoopAlignment(
   cl::desc("Do not align and prefetch loops"),
   cl::init(false));
 
+static bool hasFP32Denormals(const MachineFunction &MF) {
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  return Info->getMode().FP32Denormals;
+}
+
+static bool hasFP64FP16Denormals(const MachineFunction &MF) {
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  return Info->getMode().FP64FP16Denormals;
+}
+
 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
   unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
   for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
@@ -370,9 +380,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FLOG10, MVT::f16, Custom);
   }
 
-  // v_mad_f32 does not support denormals according to some sources.
-  if (!Subtarget->hasFP32Denormals())
-    setOperationAction(ISD::FMAD, MVT::f32, Legal);
+  // v_mad_f32 does not support denormals. We report it as unconditionally
+  // legal, and the context where it is formed will disallow it when fp32
+  // denormals are enabled.
+  setOperationAction(ISD::FMAD, MVT::f32, Legal);
 
   if (!Subtarget->hasBFI()) {
     // fcopysign can be done in a single instruction with BFI.
@@ -510,7 +521,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   // F16 - VOP3 Actions.
   setOperationAction(ISD::FMA, MVT::f16, Legal);
-  if (!Subtarget->hasFP16Denormals() && STI.hasMadF16())
+  if (STI.hasMadF16())
     setOperationAction(ISD::FMAD, MVT::f16, Legal);
 
   for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
@@ -772,8 +783,9 @@ bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
                                        EVT DestVT, EVT SrcVT) const {
   return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
           (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
-         DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
-         SrcVT.getScalarType() == MVT::f16;
+         DestVT.getScalarType() == MVT::f32 &&
+         SrcVT.getScalarType() == MVT::f16 &&
+         !hasFP32Denormals(DAG.getMachineFunction());
 }
 
 bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
@@ -3930,7 +3942,7 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
     // mad available which returns the same result as the separate operations
     // which we should prefer over fma. We can't use this if we want to support
     // denormals, so only report this in these cases.
-    if (Subtarget->hasFP32Denormals())
+    if (hasFP32Denormals(MF))
       return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
 
     // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
@@ -3939,7 +3951,7 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
   case MVT::f64:
     return true;
   case MVT::f16:
-    return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
+    return Subtarget->has16BitInsts() && hasFP64FP16Denormals(MF);
   default:
     break;
   }
@@ -3953,9 +3965,11 @@ bool SITargetLowering::isFMADLegalForFAddFSub(const SelectionDAG &DAG,
   // v_mad_f32/v_mac_f32 do not support denormals.
   EVT VT = N->getValueType(0);
   if (VT == MVT::f32)
-    return !Subtarget->hasFP32Denormals();
-  if (VT == MVT::f16)
-    return !Subtarget->hasFP16Denormals() && Subtarget->hasMadF16();
+    return !hasFP32Denormals(DAG.getMachineFunction());
+  if (VT == MVT::f16) {
+    return Subtarget->hasMadF16() &&
+      !hasFP64FP16Denormals(DAG.getMachineFunction());
+  }
 
   return false;
 }
@@ -7564,7 +7578,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
   const SDNodeFlags Flags = Op->getFlags();
   bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();
 
-  if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
+  if (!Unsafe && VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction()))
     return SDValue();
 
   if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
@@ -7707,7 +7721,7 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
 static const SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
                                           const SDLoc &SL, const GCNSubtarget *ST) {
   assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
-  int DPDenormModeDefault = ST->hasFP64Denormals()
+  int DPDenormModeDefault = hasFP64FP16Denormals(DAG.getMachineFunction())
                             ? FP_DENORM_FLUSH_NONE
                             : FP_DENORM_FLUSH_IN_FLUSH_OUT;
 
@@ -7743,7 +7757,9 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
                                  (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
   const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
 
-  if (!Subtarget->hasFP32Denormals()) {
+  const bool HasFP32Denormals = hasFP32Denormals(DAG.getMachineFunction());
+
+  if (!HasFP32Denormals) {
     SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
 
     SDValue EnableDenorm;
@@ -7787,8 +7803,7 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
   SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
                              NumeratorScaled, Fma3);
 
-  if (!Subtarget->hasFP32Denormals()) {
-
+  if (!HasFP32Denormals) {
     SDValue DisableDenorm;
     if (Subtarget->hasDenormModeInst()) {
       const SDValue DisableDenormValue =
@@ -8762,7 +8777,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
     auto F = CFP->getValueAPF();
     if (F.isNaN() && F.isSignaling())
       return false;
-    return !F.isDenormal() || denormalsEnabledForType(Op.getValueType());
+    return !F.isDenormal() || denormalsEnabledForType(DAG, Op.getValueType());
   }
 
   // If source is a result of another standard FP operation it is already in
@@ -8831,7 +8846,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
 
     // snans will be quieted, so we only need to worry about denormals.
     if (Subtarget->supportsMinMaxDenormModes() ||
-        denormalsEnabledForType(Op.getValueType()))
+        denormalsEnabledForType(DAG, Op.getValueType()))
       return true;
 
     // Flushing may be required.
@@ -8903,7 +8918,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
     LLVM_FALLTHROUGH;
   }
   default:
-    return denormalsEnabledForType(Op.getValueType()) &&
+    return denormalsEnabledForType(DAG, Op.getValueType()) &&
            DAG.isKnownNeverSNaN(Op);
   }
 
@@ -8914,7 +8929,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
 SDValue SITargetLowering::getCanonicalConstantFP(
   SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
   // Flush denormals to 0 if not enabled.
-  if (C.isDenormal() && !denormalsEnabledForType(VT))
+  if (C.isDenormal() && !denormalsEnabledForType(DAG, VT))
     return DAG.getConstantFP(0.0, SL, VT);
 
   if (C.isNaN()) {
@@ -9452,8 +9467,8 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
 
   // Only do this if we are not trying to support denormals. v_mad_f32 does not
   // support denormals ever.
-  if (((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
-       (VT == MVT::f16 && !Subtarget->hasFP16Denormals() &&
+  if (((VT == MVT::f32 && !hasFP32Denormals(DAG.getMachineFunction())) ||
+       (VT == MVT::f16 && !hasFP64FP16Denormals(DAG.getMachineFunction()) &&
         getSubtarget()->hasMadF16())) &&
        isOperationLegal(ISD::FMAD, VT))
     return ISD::FMAD;
@@ -10964,14 +10979,14 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
   return false;
 }
 
-bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
+bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
+                                               EVT VT) const {
   switch (VT.getScalarType().getSimpleVT().SimpleTy) {
   case MVT::f32:
-    return Subtarget->hasFP32Denormals();
+    return hasFP32Denormals(DAG.getMachineFunction());
   case MVT::f64:
-    return Subtarget->hasFP64Denormals();
   case MVT::f16:
-    return Subtarget->hasFP16Denormals();
+    return hasFP64FP16Denormals(DAG.getMachineFunction());
   default:
     return false;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index b2c2e40923a..d82473fca98 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -393,7 +393,7 @@ public:
   bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
                        unsigned MaxDepth = 5) const;
 
-  bool denormalsEnabledForType(EVT VT) const;
+  bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const;
 
   bool isKnownNeverNaNForTargetNode(SDValue Op,
                                     const SelectionDAG &DAG,
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 7dd0f11c95d..0c67b1467a5 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -28,7 +28,6 @@ using namespace llvm;
 
 SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   : AMDGPUMachineFunction(MF),
-    Mode(MF.getFunction()),
    PrivateSegmentBuffer(false),
    DispatchPtr(false),
    QueuePtr(false),
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 0d6153d06ce..ef0186f7d57 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -340,9 +340,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
 
   AMDGPUFunctionArgInfo ArgInfo;
 
-  // State of MODE register, assumed FP mode.
-  AMDGPU::SIModeRegisterDefaults Mode;
-
   // Graphics info.
   unsigned PSInputAddr = 0;
   unsigned PSInputEnable = 0;
 
@@ -515,10 +512,6 @@ public:
              : I->second.Lanes[Lane];
   }
 
-  AMDGPU::SIModeRegisterDefaults getMode() const {
-    return Mode;
-  }
-
   bool haveFreeLanesForSGPRSpill(const MachineFunction &MF,
                                  unsigned NumLane) const;
 
   bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index c72f93eb739..a4b216f583d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1303,7 +1303,8 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
   return true;
 }
 
-SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) {
+SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F,
+                                               const GCNSubtarget &ST) {
   *this = getDefaultForCallingConv(F.getCallingConv());
 
   StringRef IEEEAttr = F.getFnAttribute("amdgpu-ieee").getValueAsString();
@@ -1314,6 +1315,9 @@
     = F.getFnAttribute("amdgpu-dx10-clamp").getValueAsString();
   if (!DX10ClampAttr.empty())
     DX10Clamp = DX10ClampAttr == "true";
+
+  FP32Denormals = ST.hasFP32Denormals(F);
+  FP64FP16Denormals = ST.hasFP64FP16Denormals(F);
 }
 
 namespace {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index f8c082060ff..05bb39235a4 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -676,7 +676,8 @@ struct SIModeRegisterDefaults {
     FP32Denormals(true),
     FP64FP16Denormals(true) {}
 
-  SIModeRegisterDefaults(const Function &F);
+  // FIXME: Should not depend on the subtarget
+  SIModeRegisterDefaults(const Function &F, const GCNSubtarget &ST);
 
   static SIModeRegisterDefaults getDefaultForCallingConv(CallingConv::ID CC) {
     const bool IsCompute = AMDGPU::isCompute(CC);
@@ -695,10 +696,23 @@ struct SIModeRegisterDefaults {
            FP64FP16Denormals == Other.FP64FP16Denormals;
   }
 
+  /// Returns true if a single mode flag is compatible: the caller and callee
+  /// agree, or the flag is enabled in the caller but disabled in the callee.
+  static bool oneWayCompatible(bool CallerMode, bool CalleeMode) {
+    return CallerMode == CalleeMode || (CallerMode && !CalleeMode);
+  }
+
   // FIXME: Inlining should be OK for dx10-clamp, since the caller's mode should
   // be able to override.
   bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const {
-    return *this == CalleeMode;
+    if (DX10Clamp != CalleeMode.DX10Clamp)
+      return false;
+    if (IEEE != CalleeMode.IEEE)
+      return false;
+
+    // Allow inlining denormals-flushed functions into denormals-enabled ones.
+    return oneWayCompatible(FP64FP16Denormals, CalleeMode.FP64FP16Denormals) &&
+           oneWayCompatible(FP32Denormals, CalleeMode.FP32Denormals);
   }
 };
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir
index 70c5edbd87f..076b1dee6d6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir
@@ -37,8 +37,8 @@ body: |
     liveins: $vgpr0
     ; GFX9-LABEL: name: fcanonicalize_f16_flush
    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
-    ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F16_e64_]]
+    ; GFX9: [[V_MUL_F16_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F16_e64 0, 15360, 0, [[COPY]], 0, 0, implicit $exec
+    ; GFX9: S_ENDPGM 0, implicit [[V_MUL_F16_e64_]]
    %0:vgpr(s32) = COPY $vgpr0
    %1:vgpr(s16) = G_TRUNC %0
    %2:vgpr(s16) = G_FCANONICALIZE %1
@@ -60,8 +60,8 @@
 
     ; GFX9-LABEL: name: fcanonicalize_f32_denorm
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 0, [[COPY]], 0, 0, implicit $exec
-    ; GFX9: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]]
+    ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
+    ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_FCANONICALIZE %0
     S_ENDPGM 0, implicit %1
@@ -170,8 +170,8 @@
 
     ; GFX9-LABEL: name: fcanonicalize_f64_flush
     ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; GFX9: [[V_MAX_F64_:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
-    ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F64_]]
+    ; GFX9: [[V_MUL_F64_:%[0-9]+]]:vreg_64 = V_MUL_F64 0, 4607182418800017408, 0, [[COPY]], 0, 0, implicit $exec
+    ; GFX9: S_ENDPGM 0, implicit [[V_MUL_F64_]]
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(s64) = G_FCANONICALIZE %0
     S_ENDPGM 0, implicit %1
@@ -191,8 +191,8 @@
     liveins: $vgpr0
     ; GFX9-LABEL: name: fcanonicalize_fabs_f32_denorm
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 2, [[COPY]], 0, 0, implicit $exec
-    ; GFX9: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]]
+    ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 2, [[COPY]], 2, [[COPY]], 0, 0, implicit $exec
+    ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_FABS %0
     %2:vgpr(s32) = G_FCANONICALIZE %1
@@ -237,8 +237,8 @@
     liveins: $vgpr0
     ; GFX9-LABEL: name: fcanonicalize_fneg_f32_denorm
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 3212836864, 0, [[COPY]], 0, 0, implicit $exec
-    ; GFX9: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]]
+    ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 1, [[COPY]], 1, [[COPY]], 0, 0, implicit $exec
+    ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_FNEG %0
     %2:vgpr(s32) = G_FCANONICALIZE %1
@@ -283,8 +283,8 @@
 
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
     ; GFX9: [[V_XOR_B32_e32_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 [[S_MOV_B32_]], [[COPY]], implicit $exec
-    ; GFX9: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 2, [[V_XOR_B32_e32_]], 0, 0, implicit $exec
-    ; GFX9: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]]
+    ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 2, [[V_XOR_B32_e32_]], 2, [[V_XOR_B32_e32_]], 0, 0, implicit $exec
+    ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_FNEG %0
     %2:vgpr(s32) = G_FABS %1
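
A note on what the patch does mechanically: the effective FP mode becomes a per-function property, computed once from the calling convention, the "amdgpu-ieee" / "amdgpu-dx10-clamp" attributes, and (as the FIXMEs say, only as a placeholder) the subtarget's denormal features, then cached in AMDGPUMachineFunction so that passes query MFI->getMode() instead of the subtarget. The sketch below models that derivation; the FuncInfo struct and computeMode helper are illustrative stand-ins for the real LLVM types (Function, GCNSubtarget), not LLVM API.

// Simplified model of how SIModeRegisterDefaults is constructed in this patch.
#include <string>

struct ModeDefaults {
  bool IEEE;              // MODE.IEEE bit
  bool DX10Clamp;         // clamp-on-NaN behavior
  bool FP32Denormals;     // f32 denormals preserved (not flushed)
  bool FP64FP16Denormals; // shared f64/f16 denormal control
};

// Stand-ins for the real queries (assumptions, not LLVM API):
struct FuncInfo {
  bool IsCompute;                  // derived from the calling convention
  std::string IEEEAttr;            // "amdgpu-ieee" attribute, "" if absent
  std::string DX10ClampAttr;       // "amdgpu-dx10-clamp" attribute, "" if absent
  bool SubtargetFP32Denormals;     // placeholder subtarget feature
  bool SubtargetFP64FP16Denormals; // placeholder subtarget feature
};

ModeDefaults computeMode(const FuncInfo &F) {
  // 1. Calling-convention defaults (compute kernels default to IEEE mode).
  ModeDefaults Mode{/*IEEE=*/F.IsCompute, /*DX10Clamp=*/true,
                    /*FP32Denormals=*/true, /*FP64FP16Denormals=*/true};
  // 2. Function attributes override the defaults.
  if (!F.IEEEAttr.empty())
    Mode.IEEE = F.IEEEAttr == "true";
  if (!F.DX10ClampAttr.empty())
    Mode.DX10Clamp = F.DX10ClampAttr == "true";
  // 3. For now the denormal bits still come from the subtarget features;
  //    the FIXMEs in the patch mark this as a placeholder for attributes.
  Mode.FP32Denormals = F.SubtargetFP32Denormals;
  Mode.FP64FP16Denormals = F.SubtargetFP64FP16Denormals;
  return Mode;
}

int main() {
  FuncInfo Kernel{true, "", "", false, false};
  ModeDefaults M = computeMode(Kernel);
  return (M.IEEE && !M.FP32Denormals) ? 0 : 1; // kernel: IEEE on, f32 flushed
}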
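The FloatMode value the assembler printer now derives from getMode() packs two 2-bit rounding fields and two 2-bit denormal fields into one byte. Below is a minimal sketch of the encoding, assuming the bit layout of the FP_ROUND_MODE_* / FP_DENORM_MODE_* macros in SIDefines.h (SP round in bits [1:0], DP round in [3:2], SP denorm in [5:4], DP denorm in [7:6]); treat the exact positions as illustrative rather than normative.

#include <cstdint>

// Assumed field values, mirroring the SIDefines.h macros used by getFPMode().
constexpr uint32_t FP_ROUND_ROUND_TO_NEAREST = 0;
constexpr uint32_t FP_DENORM_FLUSH_IN_FLUSH_OUT = 0; // flush inputs and outputs
constexpr uint32_t FP_DENORM_FLUSH_NONE = 3;         // keep denormals

constexpr uint32_t fpRoundModeSP(uint32_t X)  { return X & 0x3; }
constexpr uint32_t fpRoundModeDP(uint32_t X)  { return (X & 0x3) << 2; }
constexpr uint32_t fpDenormModeSP(uint32_t X) { return (X & 0x3) << 4; }
constexpr uint32_t fpDenormModeDP(uint32_t X) { return (X & 0x3) << 6; }

// Per-function packing, equivalent in spirit to the patched getFPMode().
uint32_t getFPMode(bool FP32Denormals, bool FP64FP16Denormals) {
  uint32_t SP = FP32Denormals ? FP_DENORM_FLUSH_NONE
                              : FP_DENORM_FLUSH_IN_FLUSH_OUT;
  uint32_t DP = FP64FP16Denormals ? FP_DENORM_FLUSH_NONE
                                  : FP_DENORM_FLUSH_IN_FLUSH_OUT;
  return fpRoundModeSP(FP_ROUND_ROUND_TO_NEAREST) |
         fpRoundModeDP(FP_ROUND_ROUND_TO_NEAREST) |
         fpDenormModeSP(SP) | fpDenormModeDP(DP);
}

int main() {
  // 0xC0: f64/f16 denormals kept, f32 flushed, round-to-nearest everywhere.
  return getFPMode(false, true) == 0xC0 ? 0 : 1;
}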
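The relaxed isInlineCompatible check is the one behavioral change beyond code motion: IEEE and DX10Clamp must still match exactly, but a denormals-flushing callee may now be inlined into a denormals-preserving caller, while the reverse stays forbidden because inlining would silently flush results the callee computed with denormals enabled. A self-contained sketch of the rule, using a simplified Mode struct rather than the LLVM type, with a small usage example:

#include <cassert>

// The caller's flag may be "stronger" than the callee's: equal is fine, and a
// callee compiled with the flag off can live inside a caller with it on.
static bool oneWayCompatible(bool CallerMode, bool CalleeMode) {
  return CallerMode == CalleeMode || (CallerMode && !CalleeMode);
}

struct Mode {
  bool IEEE, DX10Clamp, FP32Denormals, FP64FP16Denormals;

  bool isInlineCompatible(Mode Callee) const {
    if (DX10Clamp != Callee.DX10Clamp || IEEE != Callee.IEEE)
      return false;
    return oneWayCompatible(FP64FP16Denormals, Callee.FP64FP16Denormals) &&
           oneWayCompatible(FP32Denormals, Callee.FP32Denormals);
  }
};

int main() {
  Mode DenormOn{true, true, true, true};    // denormals preserved
  Mode DenormOff{true, true, false, false}; // denormals flushed
  // Flushing callee into preserving caller: allowed.
  assert(DenormOn.isInlineCompatible(DenormOff));
  // Preserving callee into flushing caller: rejected, since the callee's
  // denormal results would be flushed after inlining.
  assert(!DenormOff.isInlineCompatible(DenormOn));
  return 0;
}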