Diffstat (limited to 'llvm/lib/Target')
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp          |  5
 llvm/lib/Target/AMDGPU/AMDGPUFeatures.td             |  6
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp           |  6
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h             | 10
 llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 11
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp            |  3
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp            | 23
 llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp     |  1
 llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h       |  7
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp      | 13
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h        | 39
11 files changed, 94 insertions, 30 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index c9fe9ae7e85..d704a0fae0d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -892,10 +892,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   // register.
   ProgInfo.FloatMode = getFPMode(MF);
 
-  ProgInfo.IEEEMode = STM.enableIEEEBit(MF);
+  const SIModeRegisterDefaults Mode = MFI->getMode();
+  ProgInfo.IEEEMode = Mode.IEEE;
 
   // Make clamp modifier on NaN input returns 0.
-  ProgInfo.DX10Clamp = STM.enableDX10Clamp();
+  ProgInfo.DX10Clamp = Mode.DX10Clamp;
 
   unsigned LDSAlignShift;
   if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
index 59637883b12..4e03ebce31c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -54,12 +54,6 @@ class SubtargetFeatureGeneration <string Value, string Subtarget,
   SubtargetFeature <Value, "Gen", Subtarget#"::"#Value,
                     Value#" GPU generation", Implies>;
 
-def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp",
-  "DX10Clamp",
-  "true",
-  "clamp modifier clamps NaNs to 0.0"
->;
-
 def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
   "EnablePromoteAlloca",
   "true",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 4177dde35ca..c802d4d3a6b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -45,7 +45,7 @@ GCNSubtarget::~GCNSubtarget() = default;
 R600Subtarget &
 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                                StringRef GPU, StringRef FS) {
-  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
+  SmallString<256> FullFS("+promote-alloca,");
   FullFS += FS;
   ParseSubtargetFeatures(GPU, FullFS);
 
@@ -77,7 +77,7 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
   // Similarly we want enable-prt-strict-null to be on by default and not to
   // unset everything else if it is disabled
 
-  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");
+  SmallString<256> FullFS("+promote-alloca,+load-store-opt,");
   if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
     FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
 
@@ -164,7 +164,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     HalfRate64Ops(false),
 
     FP64FP16Denormals(false),
-    DX10Clamp(false),
     FlatForGlobal(false),
     AutoWaitcntBeforeBarrier(false),
     CodeObjectV3(false),
@@ -461,7 +460,6 @@ R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
     FMA(false),
     CaymanISA(false),
     CFALUBug(false),
-    DX10Clamp(false),
     HasVertexCache(false),
     R600ALUInst(false),
     FP64(false),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 91cc44cbd11..3efde35a0df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -286,7 +286,6 @@ protected:
 
   // Dynamially set bits that enable features.
   bool FP64FP16Denormals;
-  bool DX10Clamp;
   bool FlatForGlobal;
   bool AutoWaitcntBeforeBarrier;
   bool CodeObjectV3;
@@ -531,14 +530,6 @@ public:
     return getGeneration() >= AMDGPUSubtarget::GFX9;
   }
 
-  bool enableDX10Clamp() const {
-    return DX10Clamp;
-  }
-
-  bool enableIEEEBit(const MachineFunction &MF) const {
-    return AMDGPU::isCompute(MF.getFunction().getCallingConv());
-  }
-
   bool useFlatForGlobal() const {
     return FlatForGlobal;
   }
@@ -970,7 +961,6 @@ private:
   bool FMA;
   bool CaymanISA;
   bool CFALUBug;
-  bool DX10Clamp;
   bool HasVertexCache;
   bool R600ALUInst;
   bool FP64;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 8fcabeba5ed..37c8de93be5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -611,7 +611,7 @@ unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
 }
 
 bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
-                                         const Function *Callee) const {
+                                     const Function *Callee) const {
   const TargetMachine &TM = getTLI()->getTargetMachine();
   const FeatureBitset &CallerBits =
     TM.getSubtargetImpl(*Caller)->getFeatureBits();
@@ -620,7 +620,14 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
 
   FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
   FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
-  return ((RealCallerBits & RealCalleeBits) == RealCalleeBits);
+  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
+    return false;
+
+  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
+  // no way to support merge for backend defined attributes.
+  AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
+  AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
+  return CallerMode.isInlineCompatible(CalleeMode);
 }
 
 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 4a82d3a5879..2ebcbae90bb 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1120,7 +1120,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
   // omod is ignored by hardware if IEEE bit is enabled. omod also does not
   // correctly handle signed zeros.
   //
-  bool IsIEEEMode = ST->enableIEEEBit(MF);
+  // FIXME: Also need to check strictfp
+  bool IsIEEEMode = MFI->getMode().IEEE;
   bool HasNSZ = MFI->hasNoSignedZerosFPMath();
 
   for (MachineBasicBlock *MBB : depth_first(&MF)) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8437e4bb34e..c3995c2291f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4145,7 +4145,9 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
 SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
                                                SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
-  bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  bool IsIEEEMode = Info->getMode().IEEE;
 
   // FIXME: Assert during eslection that this is only selected for
   // ieee_mode. Currently a combine can produce the ieee version for non-ieee
@@ -8300,9 +8302,12 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
   if (Cmp == APFloat::cmpGreaterThan)
     return SDValue();
 
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
   // TODO: Check IEEE bit enabled?
   EVT VT = Op0.getValueType();
-  if (Subtarget->enableDX10Clamp()) {
+  if (Info->getMode().DX10Clamp) {
     // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
     // hardware fmed3 behavior converting to a min.
     // FIXME: Should this be allowing -0.0?
@@ -8436,9 +8441,12 @@ SDValue SITargetLowering::performFMed3Combine(SDNode *N,
     return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
   }
 
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
   // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
   // handling no dx10-clamp?
-  if (Subtarget->enableDX10Clamp()) {
+  if (Info->getMode().DX10Clamp) {
     // If NaNs is clamped to 0, we are free to reorder the inputs.
 
     if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
@@ -9128,11 +9136,13 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
   if (!CSrc)
     return SDValue();
 
+  const MachineFunction &MF = DCI.DAG.getMachineFunction();
   const APFloat &F = CSrc->getValueAPF();
   APFloat Zero = APFloat::getZero(F.getSemantics());
   APFloat::cmpResult Cmp0 = F.compare(Zero);
   if (Cmp0 == APFloat::cmpLessThan ||
-      (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
+      (Cmp0 == APFloat::cmpUnordered &&
+       MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
     return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
   }
 
@@ -9967,7 +9977,10 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                     bool SNaN,
                                                     unsigned Depth) const {
   if (Op.getOpcode() == AMDGPUISD::CLAMP) {
-    if (Subtarget->enableDX10Clamp())
+    const MachineFunction &MF = DAG.getMachineFunction();
+    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+    if (Info->getMode().DX10Clamp)
       return true;  // Clamped to 0.
     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
   }
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 245b5e74ac6..196e6e1d999 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -28,6 +28,7 @@ using namespace llvm;
 
 SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   : AMDGPUMachineFunction(MF),
+    Mode(MF.getFunction()),
     PrivateSegmentBuffer(false),
     DispatchPtr(false),
     QueuePtr(false),
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index ce103dcc31b..29e2460ae79 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -148,6 +148,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
 
   AMDGPUFunctionArgInfo ArgInfo;
 
+  // State of MODE register, assumed FP mode.
+  AMDGPU::SIModeRegisterDefaults Mode;
+
   // Graphics info.
   unsigned PSInputAddr = 0;
   unsigned PSInputEnable = 0;
@@ -281,6 +284,10 @@ public:
     return SpillVGPRs;
   }
 
+  AMDGPU::SIModeRegisterDefaults getMode() const {
+    return Mode;
+  }
+
   bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
   void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI);
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index b397554e76d..819c06df158 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1002,6 +1002,19 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
   return true;
 }
 
+SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) {
+  *this = getDefaultForCallingConv(F.getCallingConv());
+
+  StringRef IEEEAttr = F.getFnAttribute("amdgpu-ieee").getValueAsString();
+  if (!IEEEAttr.empty())
+    IEEE = IEEEAttr == "true";
+
+  StringRef DX10ClampAttr
+    = F.getFnAttribute("amdgpu-dx10-clamp").getValueAsString();
+  if (!DX10ClampAttr.empty())
+    DX10Clamp = DX10ClampAttr == "true";
+}
+
 namespace {
 
 struct SourceOfDivergence {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 174fec47986..2943722963a 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -495,6 +495,45 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
 /// \returns true if the intrinsic is divergent
 bool isIntrinsicSourceOfDivergence(unsigned IntrID);
 
+
+// Track defaults for fields in the MODE registser.
+struct SIModeRegisterDefaults {
+  /// Floating point opcodes that support exception flag gathering quiet and
+  /// propagate signaling NaN inputs per IEEE 754-2008. Min_dx10 and max_dx10
+  /// become IEEE 754-2008 compliant due to signaling NaN propagation and
+  /// quieting.
+  bool IEEE : 1;
+
+  /// Used by the vector ALU to force DX10-style treatment of NaNs: when set,
+  /// clamp NaN to zero; otherwise, pass NaN through.
+  bool DX10Clamp : 1;
+
+  // TODO: FP mode fields
+
+  SIModeRegisterDefaults() :
+    IEEE(true),
+    DX10Clamp(true) {}
+
+  SIModeRegisterDefaults(const Function &F);
+
+  static SIModeRegisterDefaults getDefaultForCallingConv(CallingConv::ID CC) {
+    SIModeRegisterDefaults Mode;
+    Mode.DX10Clamp = true;
+    Mode.IEEE = AMDGPU::isCompute(CC);
+    return Mode;
+  }
+
+  bool operator ==(const SIModeRegisterDefaults Other) const {
+    return IEEE == Other.IEEE && DX10Clamp == Other.DX10Clamp;
+  }
+
+  // FIXME: Inlining should be OK for dx10-clamp, since the caller's mode
+  // should be able to override.
+  bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const {
+    return *this == CalleeMode;
+  }
+};
+
 } // end namespace AMDGPU
 
 } // end namespace llvm
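
For context (not part of the commit): a minimal sketch of how the new per-function mode is driven from IR. The SIModeRegisterDefaults(const Function &) constructor added in AMDGPUBaseInfo.cpp starts from the calling-convention defaults (IEEE on for compute, DX10 clamp on everywhere) and then honors the "amdgpu-ieee" and "amdgpu-dx10-clamp" string attributes, so a frontend or test can override the mode per function. The helper name overrideAndQueryFPMode below is made up for illustration, and the include path assumes code living inside the AMDGPU backend.

// Sketch only: assumes compilation inside llvm/lib/Target/AMDGPU, where the
// target-internal Utils/AMDGPUBaseInfo.h header is reachable.
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/IR/Function.h"

using namespace llvm;

static bool overrideAndQueryFPMode(Function &F) {
  // Mode before any attribute is set: derived purely from the calling
  // convention by getDefaultForCallingConv().
  AMDGPU::SIModeRegisterDefaults Defaults(F);

  // Force non-IEEE, non-DX10-clamp behavior for this function. Absent
  // attributes leave the calling-convention defaults in place.
  F.addFnAttr("amdgpu-ieee", "false");
  F.addFnAttr("amdgpu-dx10-clamp", "false");

  // Re-reading the function now reflects the overrides. isInlineCompatible is
  // the same check GCNTTIImpl::areInlineCompatible performs between caller and
  // callee modes; it returns false here because the two modes now differ.
  AMDGPU::SIModeRegisterDefaults Overridden(F);
  return Defaults.isInlineCompatible(Overridden);
}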