author    Tom Stellard <thomas.stellard@amd.com>  2016-12-07 02:42:15 +0000
committer Tom Stellard <thomas.stellard@amd.com>  2016-12-07 02:42:15 +0000
commit    8485fa096e0baf95790a8b1ba7f8ea8806f50a3e (patch)
tree      4571875722107bcf446cf548964cfc23a39b08e7 /llvm/lib/Target/AMDGPU
parent    2f50fef095298706e7dbe1f47b7d9421c74a790b (diff)
AMDGPU : Add S_SETREG instructions to fix fdiv precision issues.
Patch By: Wei Ding

Summary: This patch fixes the fdiv precision issues.

Reviewers: b-sumner, cfang, wdng, arsenm

Subscribers: kzhuravl, nhaehnle, yaxunl, tony-tye

Differential Revision: https://reviews.llvm.org/D26424

llvm-svn: 288879
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |  38
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |   3
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |   4
 llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td     |  13
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp     |   7
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 112
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |   2
 llvm/lib/Target/AMDGPU/SOPInstructions.td     |   6
 8 files changed, 173 insertions(+), 12 deletions(-)
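
Note on the approach: when FP32 denormals are otherwise disabled, the new lowering brackets the f32 fdiv expansion with s_setreg writes that flip the FP_DENORM bits of the MODE hardware register, so the intermediate FMA results are not flushed to zero. Below is a minimal C sketch of how such an s_setreg bitfield operand is put together, assuming the usual GCN hwreg layout (id in bits [5:0], offset in bits [10:6], width-1 in bits [15:11]) and HW_REG_MODE == 1; encode_hwreg is a made-up helper for illustration, not an LLVM API.

/* Sketch only: mirrors ID_MODE | (4 << OFFSET_SHIFT_) | (1 << WIDTH_M1_SHIFT_)
 * built as Denorm32Reg in the SIISelLowering.cpp hunk below. The field layout
 * and the HW_REG_MODE value are assumptions, not copied from SIDefines.h. */
#include <stdint.h>
#include <stdio.h>

enum {
  HWREG_ID_SHIFT       = 0,   /* hardware register id, bits [5:0]    */
  HWREG_OFFSET_SHIFT   = 6,   /* first bit of the field, bits [10:6] */
  HWREG_WIDTH_M1_SHIFT = 11,  /* field width minus one, bits [15:11] */
  HW_REG_MODE          = 1    /* assumed id of the MODE register     */
};

static uint16_t encode_hwreg(unsigned id, unsigned offset, unsigned width) {
  return (uint16_t)((id << HWREG_ID_SHIFT) |
                    (offset << HWREG_OFFSET_SHIFT) |
                    ((width - 1u) << HWREG_WIDTH_M1_SHIFT));
}

int main(void) {
  /* MODE.FP_DENORM for f32 sits at offset 4 with width 2, so this matches
   * the Denorm32Reg constant in the diff: hwreg(MODE, 4, 2). */
  printf("s_setreg simm16 = 0x%04x\n", encode_hwreg(HW_REG_MODE, 4, 2));
  return 0;
}
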
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index bf493c9fd38..c74fc4a0018 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -135,6 +135,8 @@ private:
void SelectADD_SUB_I64(SDNode *N);
void SelectDIV_SCALE(SDNode *N);
+ void SelectFMA_W_CHAIN(SDNode *N);
+ void SelectFMUL_W_CHAIN(SDNode *N);
SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
uint32_t Offset, uint32_t Width);
@@ -296,6 +298,15 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectADD_SUB_I64(N);
return;
}
+ case AMDGPUISD::FMUL_W_CHAIN: {
+ SelectFMUL_W_CHAIN(N);
+ return;
+ }
+ case AMDGPUISD::FMA_W_CHAIN: {
+ SelectFMA_W_CHAIN(N);
+ return;
+ }
+
case ISD::SCALAR_TO_VECTOR:
case AMDGPUISD::BUILD_VERTICAL_VECTOR:
case ISD::BUILD_VECTOR: {
@@ -653,6 +664,33 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
CurDAG->RemoveDeadNode(N);
}
+void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
+ SDLoc SL(N);
+ // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
+ SDValue Ops[10];
+
+ SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
+ SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
+ SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
+ Ops[8] = N->getOperand(0);
+ Ops[9] = N->getOperand(4);
+
+ CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32, N->getVTList(), Ops);
+}
+
+void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
+ SDLoc SL(N);
+ // src0_modifiers, src0, src1_modifiers, src1, clamp, omod
+ SDValue Ops[8];
+
+ SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
+ SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
+ Ops[6] = N->getOperand(0);
+ Ops[7] = N->getOperand(3);
+
+ CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
+}
+
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 23a783e7612..8cc995c7b70 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2953,6 +2953,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(DWORDADDR)
NODE_NAME_CASE(FRACT)
NODE_NAME_CASE(SETCC)
+ NODE_NAME_CASE(SETREG)
+ NODE_NAME_CASE(FMA_W_CHAIN)
+ NODE_NAME_CASE(FMUL_W_CHAIN)
NODE_NAME_CASE(CLAMP)
NODE_NAME_CASE(COS_HW)
NODE_NAME_CASE(SIN_HW)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 965d4d14190..f01afefae55 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -230,6 +230,10 @@ enum NodeType : unsigned {
// This is SETCC with the full mask result which is used for a compare with a
// result bit per item in the wavefront.
SETCC,
+ SETREG,
+ // FP ops with input and output chain.
+ FMA_W_CHAIN,
+ FMUL_W_CHAIN,
// SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
// Denormals handled on some parts.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index c8d1bfb1b78..e7b40016e27 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -150,6 +150,19 @@ def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc
def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>;
+def AMDGPUSetRegOp : SDTypeProfile<0, 2, [
+ SDTCisInt<0>, SDTCisInt<1>
+]>;
+
+def AMDGPUsetreg : SDNode<"AMDGPUISD::SETREG", AMDGPUSetRegOp, [
+ SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUfma : SDNode<"AMDGPUISD::FMA_W_CHAIN", SDTFPTernaryOp, [
+ SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AMDGPUmul : SDNode<"AMDGPUISD::FMUL_W_CHAIN", SDTFPBinOp, [
+ SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0",
SDTIntToFPOp, []>;
def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1",
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 7bf6ec22469..3d59f8d82ae 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -173,6 +173,13 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
MI->setDesc(TII->get(Opc));
}
+ // Special case for s_setreg_b32
+ if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
+ MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
+ FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
+ return true;
+ }
+
// If we are already folding into another operand of MI, then
// we can't commute the instruction, otherwise we risk making the
// other fold illegal.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 535db8f1522..da60a0f7bdc 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -21,6 +21,7 @@
#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
@@ -2897,6 +2898,47 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
return SDValue();
}
+static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
+ EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
+ if (GlueChain->getNumValues() <= 1) {
+ return DAG.getNode(Opcode, SL, VT, A, B);
+ }
+
+ assert(GlueChain->getNumValues() == 3);
+
+ SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
+ switch (Opcode) {
+ default: llvm_unreachable("no chain equivalent for opcode");
+ case ISD::FMUL:
+ Opcode = AMDGPUISD::FMUL_W_CHAIN;
+ break;
+ }
+
+ return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
+ GlueChain.getValue(2));
+}
+
+static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
+ EVT VT, SDValue A, SDValue B, SDValue C,
+ SDValue GlueChain) {
+ if (GlueChain->getNumValues() <= 1) {
+ return DAG.getNode(Opcode, SL, VT, A, B, C);
+ }
+
+ assert(GlueChain->getNumValues() == 3);
+
+ SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
+ switch (Opcode) {
+ default: llvm_unreachable("no chain equivalent for opcode");
+ case ISD::FMA:
+ Opcode = AMDGPUISD::FMA_W_CHAIN;
+ break;
+ }
+
+ return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
+ GlueChain.getValue(2));
+}
+
// Faster 2.5 ULP division that does not support denormals.
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
@@ -2943,25 +2985,73 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
- SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS);
- SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS);
+ SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
+ RHS, RHS, LHS);
+ SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
+ LHS, RHS, LHS);
// Denominator is scaled to not be denormal, so using rcp is ok.
- SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled);
+ SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
+ DenominatorScaled);
+ SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
+ DenominatorScaled);
+
+ const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
+ (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
+ (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
+
+ const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
+
+ if (!Subtarget->hasFP32Denormals()) {
+ SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
+ SL, MVT::i32);
+ SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
+ DAG.getEntryNode(),
+ EnableDenormValue, BitField);
+ SDValue Ops[3] = {
+ NegDivScale0,
+ EnableDenorm.getValue(0),
+ EnableDenorm.getValue(1)
+ };
+
+ NegDivScale0 = DAG.getMergeValues(Ops, SL);
+ }
+
+ SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
+ ApproxRcp, One, NegDivScale0);
+
+ SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
+ ApproxRcp, Fma0);
- SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled);
+ SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
+ Fma1, Fma1);
- SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, ApproxRcp, One);
- SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, ApproxRcp);
+ SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
+ NumeratorScaled, Mul);
- SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1);
+ SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
- SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled);
- SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul);
- SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled);
+ SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
+ NumeratorScaled, Fma3);
+
+ if (!Subtarget->hasFP32Denormals()) {
+ const SDValue DisableDenormValue =
+ DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
+ SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
+ Fma4.getValue(1),
+ DisableDenormValue,
+ BitField,
+ Fma4.getValue(2));
+
+ SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
+ DisableDenorm, DAG.getRoot());
+ DAG.setRoot(OutputChain);
+ }
SDValue Scale = NumeratorScaled.getValue(1);
- SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1, Fma3, Scale);
+ SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
+ Fma4, Fma1, Fma3, Scale);
return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
}
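
The Fma0..Fma4 chain above is a standard Newton-Raphson refinement of the reciprocal and the quotient. The following C sketch shows just that arithmetic, with variable names mirroring the SDValues in the hunk; the DIV_SCALE pre-scaling, the DIV_FMAS/DIV_FIXUP post-processing and the denormal-mode toggling are omitted, and the final line only approximates what v_div_fmas computes.

/* Sketch of the refinement LowerFDIV32 emits between DIV_SCALE and DIV_FMAS;
 * assumes the inputs were already scaled out of the denormal range and that
 * fmaf is a correctly fused multiply-add. */
#include <math.h>

float fdiv32_refine(float num_scaled, float den_scaled,
                    float approx_rcp /* hardware estimate of 1/den_scaled */) {
  float neg_den = -den_scaled;                      /* NegDivScale0       */
  float fma0 = fmaf(neg_den, approx_rcp, 1.0f);     /* error of the rcp   */
  float fma1 = fmaf(fma0, approx_rcp, approx_rcp);  /* refined reciprocal */
  float mul  = num_scaled * fma1;                   /* quotient estimate  */
  float fma2 = fmaf(neg_den, mul, num_scaled);      /* residual           */
  float fma3 = fmaf(fma2, fma1, mul);               /* refined quotient   */
  float fma4 = fmaf(neg_den, fma3, num_scaled);     /* final residual     */
  /* v_div_fmas then folds the last correction in (roughly fma4*fma1+fma3)
   * before v_div_fixup handles special cases and rescaling. */
  return fmaf(fma4, fma1, fma3);
}

The residuals fma0/fma2/fma4 can fall into the denormal range even when the final quotient does not, which is why the lowering enables FP32 denormals around this sequence when they are disabled globally.
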
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d05ef38be7e..9071ded6567 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1669,6 +1669,8 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
// boundaries prevents incorrect movements of such instructions.
return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
+ MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
+ MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
changesVGPRIndexingMode(MI);
}
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 2486fbf3edf..0aeb1297d3a 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -590,10 +590,13 @@ def S_GETREG_B32 : SOPK_Pseudo <
>;
}
+let hasSideEffects = 1 in {
+
def S_SETREG_B32 : SOPK_Pseudo <
"s_setreg_b32",
(outs), (ins SReg_32:$sdst, hwreg:$simm16),
- "$simm16, $sdst"
+ "$simm16, $sdst",
+ [(AMDGPUsetreg i32:$sdst, (i16 timm:$simm16))]
>;
// FIXME: Not on SI?
@@ -607,6 +610,7 @@ def S_SETREG_IMM32_B32 : SOPK_Pseudo <
let has_sdst = 0;
}
+} // End hasSideEffects = 1
//===----------------------------------------------------------------------===//
// SOPC Instructions