diff options
| author | Tom Stellard <thomas.stellard@amd.com> | 2016-12-07 02:42:15 +0000 |
|---|---|---|
| committer | Tom Stellard <thomas.stellard@amd.com> | 2016-12-07 02:42:15 +0000 |
| commit | 8485fa096e0baf95790a8b1ba7f8ea8806f50a3e (patch) | |
| tree | 4571875722107bcf446cf548964cfc23a39b08e7 /llvm/lib/Target/AMDGPU/SIISelLowering.cpp | |
| parent | 2f50fef095298706e7dbe1f47b7d9421c74a790b (diff) | |
| download | bcm5719-llvm-8485fa096e0baf95790a8b1ba7f8ea8806f50a3e.tar.gz bcm5719-llvm-8485fa096e0baf95790a8b1ba7f8ea8806f50a3e.zip | |
AMDGPU: Add S_SETREG instructions to fix fdiv precision issues.
Patch By: Wei Ding
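Summary: This patch fixes the f32 fdiv precision issues. When FP32 denormals are not enabled on the subtarget, LowerFDIV32 now enables them with S_SETREG before the FMA/FMUL refinement chain and restores flushing with a second S_SETREG afterwards.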
Reviewers: b-sumner, cfang, wdng, arsenm
Subscribers: kzhuravl, nhaehnle, yaxunl, tony-tye
Differential Revision: https://reviews.llvm.org/D26424
llvm-svn: 288879
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 112 |
1 file changed, 101 insertions, 11 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 535db8f1522..da60a0f7bdc 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -21,6 +21,7 @@ #include "AMDGPU.h" #include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" +#include "SIDefines.h" #include "SIISelLowering.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" @@ -2897,6 +2898,47 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, return SDValue(); } +static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, + EVT VT, SDValue A, SDValue B, SDValue GlueChain) { + if (GlueChain->getNumValues() <= 1) { + return DAG.getNode(Opcode, SL, VT, A, B); + } + + assert(GlueChain->getNumValues() == 3); + + SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue); + switch (Opcode) { + default: llvm_unreachable("no chain equivalent for opcode"); + case ISD::FMUL: + Opcode = AMDGPUISD::FMUL_W_CHAIN; + break; + } + + return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, + GlueChain.getValue(2)); +} + +static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, + EVT VT, SDValue A, SDValue B, SDValue C, + SDValue GlueChain) { + if (GlueChain->getNumValues() <= 1) { + return DAG.getNode(Opcode, SL, VT, A, B, C); + } + + assert(GlueChain->getNumValues() == 3); + + SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue); + switch (Opcode) { + default: llvm_unreachable("no chain equivalent for opcode"); + case ISD::FMA: + Opcode = AMDGPUISD::FMA_W_CHAIN; + break; + } + + return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C, + GlueChain.getValue(2)); +} + // Faster 2.5 ULP division that does not support denormals. 
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); @@ -2943,25 +2985,73 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); - SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS); - SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS); + SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, + RHS, RHS, LHS); + SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, + LHS, RHS, LHS); // Denominator is scaled to not be denormal, so using rcp is ok. - SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled); + SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, + DenominatorScaled); + SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, + DenominatorScaled); + + const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE | + (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | + (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); + + const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16); + + if (!Subtarget->hasFP32Denormals()) { + SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE, + SL, MVT::i32); + SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs, + DAG.getEntryNode(), + EnableDenormValue, BitField); + SDValue Ops[3] = { + NegDivScale0, + EnableDenorm.getValue(0), + EnableDenorm.getValue(1) + }; + + NegDivScale0 = DAG.getMergeValues(Ops, SL); + } + + SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, + ApproxRcp, One, NegDivScale0); + + SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, + ApproxRcp, Fma0); - SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled); + SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, + Fma1, 
Fma1); - SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, ApproxRcp, One); - SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, ApproxRcp); + SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, + NumeratorScaled, Mul); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1); + SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2); - SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled); - SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul); - SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled); + SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, + NumeratorScaled, Fma3); + + if (!Subtarget->hasFP32Denormals()) { + const SDValue DisableDenormValue = + DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32); + SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other, + Fma4.getValue(1), + DisableDenormValue, + BitField, + Fma4.getValue(2)); + + SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, + DisableDenorm, DAG.getRoot()); + DAG.setRoot(OutputChain); + } SDValue Scale = NumeratorScaled.getValue(1); - SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1, Fma3, Scale); + SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, + Fma4, Fma1, Fma3, Scale); return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS); } |

