summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp99
1 files changed, 99 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index f780d43475d..e6c3739d902 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1863,12 +1863,15 @@ bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
Register Dst = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(Dst);
LLT S16 = LLT::scalar(16);
+ LLT S32 = LLT::scalar(32);
if (legalizeFastUnsafeFDIV(MI, MRI, B))
return true;
if (DstTy == S16)
return legalizeFDIV16(MI, MRI, B);
+ if (DstTy == S32)
+ return legalizeFDIV32(MI, MRI, B);
return false;
}
@@ -1966,6 +1969,102 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
return true;
}
+// Emit, at the builder's current insertion point, the instruction that
+// switches the FP32 denormal mode. With 'Enable' set the FP32 field is
+// written as FP_DENORM_FLUSH_NONE; otherwise it is written as
+// FP_DENORM_FLUSH_IN_FLUSH_OUT.
+static void toggleSPDenormMode(bool Enable,
+                               const GCNSubtarget &ST,
+                               MachineIRBuilder &B) {
+  // Value destined for the FP32 denorm field.
+  unsigned FP32Mode = FP_DENORM_FLUSH_IN_FLUSH_OUT;
+  if (Enable)
+    FP32Mode = FP_DENORM_FLUSH_NONE;
+
+  if (!ST.hasDenormModeInst()) {
+    // No S_DENORM_MODE available: address the FP32 field of the MODE
+    // hardware register (offset 4, width-minus-one 1) and set it directly.
+    unsigned FP32Field = AMDGPU::Hwreg::ID_MODE |
+                         (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
+                         (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
+
+    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
+      .addImm(FP32Mode)
+      .addImm(FP32Field);
+    return;
+  }
+
+  // S_DENORM_MODE takes both fields in a single immediate, so the FP64/FP16
+  // field (shifted up by 2) is re-emitted at the subtarget's default to leave
+  // it unchanged while the FP32 field is updated.
+  unsigned FP64FP16Mode = FP_DENORM_FLUSH_IN_FLUSH_OUT;
+  if (ST.hasFP64Denormals())
+    FP64FP16Mode = FP_DENORM_FLUSH_NONE;
+
+  B.buildInstr(AMDGPU::S_DENORM_MODE)
+    .addImm(FP32Mode | (FP64FP16Mode << 2));
+}
+
+// Lower a 32-bit G_FDIV (dispatched from legalizeFDIV when the result type is
+// s32) into the sequence: div_scale of both operands, rcp of the scaled
+// denominator, a chain of FMA/FMUL refinement steps, div_fmas, and finally
+// div_fixup writing the original destination register.
+//
+// The new instructions are inserted at 'MI' and inherit its flags; 'MI' is
+// erased afterwards. Always returns true.
+bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
+                                         MachineRegisterInfo &MRI,
+                                         MachineIRBuilder &B) const {
+  B.setInstr(MI);
+  Register Res = MI.getOperand(0).getReg();
+  Register LHS = MI.getOperand(1).getReg();
+  Register RHS = MI.getOperand(2).getReg();
+
+  uint16_t Flags = MI.getFlags();
+
+  LLT S32 = LLT::scalar(32);
+  LLT S1 = LLT::scalar(1);
+
+  auto One = B.buildFConstant(S32, 1.0f);
+
+  // llvm.amdgcn.div.scale takes (numerator, denominator, select), where the
+  // third operand is an immediate i1 choosing which value is scaled
+  // (0 = denominator, 1 = numerator). It is not the three-register form used
+  // by the DAG DIV_SCALE node, so pass LHS/RHS once and select by immediate.
+  auto DenominatorScaled =
+    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
+      .addUse(LHS)
+      .addUse(RHS)
+      .addImm(0)
+      .setMIFlags(Flags);
+  auto NumeratorScaled =
+    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
+      .addUse(LHS)
+      .addUse(RHS)
+      .addImm(1)
+      .setMIFlags(Flags);
+
+  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
+    .addUse(DenominatorScaled.getReg(0))
+    .setMIFlags(Flags);
+  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
+
+  // When the subtarget does not have FP32 denormals enabled, the refinement
+  // steps below are bracketed by a mode switch that turns them on and then
+  // back off again.
+  const bool NeedDenormToggle = !ST.hasFP32Denormals();
+
+  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
+  // aren't modeled as reading it.
+  if (NeedDenormToggle)
+    toggleSPDenormMode(true, ST, B);
+
+  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
+  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
+  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
+  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
+  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
+  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
+
+  if (NeedDenormToggle)
+    toggleSPDenormMode(false, ST, B);
+
+  // The s1 result of the numerator's div_scale is the last operand of
+  // div_fmas.
+  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
+    .addUse(Fma4.getReg(0))
+    .addUse(Fma1.getReg(0))
+    .addUse(Fma3.getReg(0))
+    .addUse(NumeratorScaled.getReg(1))
+    .setMIFlags(Flags);
+
+  // div_fixup takes the original (unscaled) RHS and LHS and defines Res.
+  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
+    .addUse(Fmas.getReg(0))
+    .addUse(RHS)
+    .addUse(LHS)
+    .setMIFlags(Flags);
+
+  MI.eraseFromParent();
+  return true;
+}
+
bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
OpenPOWER on IntegriCloud