summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--llvm/lib/Target/R600/AMDGPUISelLowering.cpp8
-rw-r--r--llvm/lib/Target/R600/AMDGPUISelLowering.h1
-rw-r--r--llvm/lib/Target/R600/AMDGPUInstrInfo.td1
-rw-r--r--llvm/lib/Target/R600/AMDGPUInstructions.td5
-rw-r--r--llvm/lib/Target/R600/R600Instructions.td2
-rw-r--r--llvm/lib/Target/R600/SIISelLowering.cpp49
-rw-r--r--llvm/lib/Target/R600/SIInstructions.td3
-rw-r--r--llvm/test/CodeGen/R600/mad-combine.ll567
8 files changed, 595 insertions, 41 deletions
diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp
index d96f03aaeb0..18697c8d3f6 100644
--- a/llvm/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -133,6 +133,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setOperationAction(ISD::FREM, MVT::f32, Custom);
setOperationAction(ISD::FREM, MVT::f64, Custom);
+ // v_mad_f32 does not support denormals according to some sources.
+ if (!Subtarget->hasFP32Denormals())
+ setOperationAction(ISD::FMAD, MVT::f32, Legal);
+
// Lower floating point store/load to integer store/load to reduce the number
// of patterns in tablegen.
setOperationAction(ISD::STORE, MVT::f32, Promote);
@@ -384,6 +388,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::FADD);
+ setTargetDAGCombine(ISD::FSUB);
+
setBooleanContents(ZeroOrNegativeOneBooleanContent);
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
@@ -2611,7 +2618,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(DWORDADDR)
NODE_NAME_CASE(FRACT)
NODE_NAME_CASE(CLAMP)
- NODE_NAME_CASE(MAD)
NODE_NAME_CASE(FMAX_LEGACY)
NODE_NAME_CASE(SMAX)
NODE_NAME_CASE(UMAX)
diff --git a/llvm/lib/Target/R600/AMDGPUISelLowering.h b/llvm/lib/Target/R600/AMDGPUISelLowering.h
index caf96539430..6bc6ca5bbc5 100644
--- a/llvm/lib/Target/R600/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/R600/AMDGPUISelLowering.h
@@ -218,7 +218,6 @@ enum {
DWORDADDR,
FRACT,
CLAMP,
- MAD, // Multiply + add with same result as the separate operations.
// SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
// Denormals handled on some parts.
diff --git a/llvm/lib/Target/R600/AMDGPUInstrInfo.td b/llvm/lib/Target/R600/AMDGPUInstrInfo.td
index d657ad05c8c..901eb5110f2 100644
--- a/llvm/lib/Target/R600/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/R600/AMDGPUInstrInfo.td
@@ -78,7 +78,6 @@ def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp,
>;
def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>;
-def AMDGPUmad : SDNode<"AMDGPUISD::MAD", SDTFPTernaryOp, []>;
// out = max(a, b) a and b are signed ints
def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp,
diff --git a/llvm/lib/Target/R600/AMDGPUInstructions.td b/llvm/lib/Target/R600/AMDGPUInstructions.td
index 8b5fe8c09db..849b241f63e 100644
--- a/llvm/lib/Target/R600/AMDGPUInstructions.td
+++ b/llvm/lib/Target/R600/AMDGPUInstructions.td
@@ -413,11 +413,6 @@ def atomic_xor_global : global_binary_atomic_op<atomic_load_xor>;
// Misc Pattern Fragments
//===----------------------------------------------------------------------===//
-def fmad : PatFrag <
- (ops node:$src0, node:$src1, node:$src2),
- (fadd (fmul node:$src0, node:$src1), node:$src2)
->;
-
class Constants {
int TWO_PI = 0x40c90fdb;
int PI = 0x40490fdb;
diff --git a/llvm/lib/Target/R600/R600Instructions.td b/llvm/lib/Target/R600/R600Instructions.td
index 08e3d51fab6..291fb0459e2 100644
--- a/llvm/lib/Target/R600/R600Instructions.td
+++ b/llvm/lib/Target/R600/R600Instructions.td
@@ -920,7 +920,7 @@ class MULADD_Common <bits<5> inst> : R600_3OP <
class MULADD_IEEE_Common <bits<5> inst> : R600_3OP <
inst, "MULADD_IEEE",
- [(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))]
+ [(set f32:$dst, (fmad f32:$src0, f32:$src1, f32:$src2))]
>;
class FMA_Common <bits<5> inst> : R600_3OP <
diff --git a/llvm/lib/Target/R600/SIISelLowering.cpp b/llvm/lib/Target/R600/SIISelLowering.cpp
index 2821a0ce475..74287144cc2 100644
--- a/llvm/lib/Target/R600/SIISelLowering.cpp
+++ b/llvm/lib/Target/R600/SIISelLowering.cpp
@@ -680,8 +680,9 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
case MVT::f32:
// This is as fast on some subtargets. However, we always have full rate f32
// mad available which returns the same result as the separate operations
- // which we should prefer over fma.
- return false;
+ // which we should prefer over fma. We can't use this if we want to support
+ // denormals, so only report this in these cases.
+ return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
case MVT::f64:
return true;
default:
@@ -1642,6 +1643,11 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
if (VT != MVT::f32)
break;
+ // Only do this if we are not trying to support denormals. v_mad_f32 does
+ // not support denormals ever.
+ if (Subtarget->hasFP32Denormals())
+ break;
+
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
@@ -1653,7 +1659,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
SDValue A = LHS.getOperand(0);
if (A == LHS.getOperand(1)) {
const SDValue Two = DAG.getConstantFP(2.0, MVT::f32);
- return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, RHS);
+ return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS);
}
}
@@ -1662,11 +1668,11 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
SDValue A = RHS.getOperand(0);
if (A == RHS.getOperand(1)) {
const SDValue Two = DAG.getConstantFP(2.0, MVT::f32);
- return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, LHS);
+ return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS);
}
}
- break;
+ return SDValue();
}
case ISD::FSUB: {
if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
@@ -1676,30 +1682,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
// Try to get the fneg to fold into the source modifier. This undoes generic
// DAG combines and folds them into the mad.
- if (VT == MVT::f32) {
+ //
+ // Only do this if we are not trying to support denormals. v_mad_f32 does
+ // not support denormals ever.
+ if (VT == MVT::f32 &&
+ !Subtarget->hasFP32Denormals()) {
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
-
- if (LHS.getOpcode() == ISD::FMUL) {
- // (fsub (fmul a, b), c) -> mad a, b, (fneg c)
-
- SDValue A = LHS.getOperand(0);
- SDValue B = LHS.getOperand(1);
- SDValue C = DAG.getNode(ISD::FNEG, DL, VT, RHS);
-
- return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C);
- }
-
- if (RHS.getOpcode() == ISD::FMUL) {
- // (fsub c, (fmul a, b)) -> mad (fneg a), b, c
-
- SDValue A = DAG.getNode(ISD::FNEG, DL, VT, RHS.getOperand(0));
- SDValue B = RHS.getOperand(1);
- SDValue C = LHS;
-
- return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C);
- }
-
if (LHS.getOpcode() == ISD::FADD) {
// (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
@@ -1708,7 +1697,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
const SDValue Two = DAG.getConstantFP(2.0, MVT::f32);
SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS);
- return DAG.getNode(AMDGPUISD::MAD, DL, VT, Two, A, NegRHS);
+ return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS);
}
}
@@ -1718,9 +1707,11 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
SDValue A = RHS.getOperand(0);
if (A == RHS.getOperand(1)) {
const SDValue NegTwo = DAG.getConstantFP(-2.0, MVT::f32);
- return DAG.getNode(AMDGPUISD::MAD, DL, VT, NegTwo, A, LHS);
+ return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS);
}
}
+
+ return SDValue();
}
break;
diff --git a/llvm/lib/Target/R600/SIInstructions.td b/llvm/lib/Target/R600/SIInstructions.td
index 2c28e03af88..4c5493ea5e4 100644
--- a/llvm/lib/Target/R600/SIInstructions.td
+++ b/llvm/lib/Target/R600/SIInstructions.td
@@ -2783,9 +2783,6 @@ def : Pat <
(V_MUL_HI_I32 $src0, $src1)
>;
-def : Vop3ModPat<V_MAD_F32, VOP_F32_F32_F32_F32, AMDGPUmad>;
-
-
defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
def : ROTRPattern <V_ALIGNBIT_B32>;
diff --git a/llvm/test/CodeGen/R600/mad-combine.ll b/llvm/test/CodeGen/R600/mad-combine.ll
new file mode 100644
index 00000000000..8c4e09bbea1
--- /dev/null
+++ b/llvm/test/CodeGen/R600/mad-combine.ll
@@ -0,0 +1,567 @@
+; Make sure we still form mad even when unsafe math or fp-contract is allowed instead of fma.
+
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
+
+; Make sure we don't form mad with denormals
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare float @llvm.fabs.f32(float) #0
+declare float @llvm.fma.f32(float, float, float) #0
+declare float @llvm.fmuladd.f32(float, float, float) #0
+
+; (fadd (fmul x, y), z) -> (fma x, y, z)
+; FUNC-LABEL: {{^}}combine_to_mad_f32_0:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+
+; SI-DENORM-SLOWFMAF-NOT: v_fma
+; SI-DENORM-SLOWFMAF-NOT: v_mad
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
+
+; SI: buffer_store_dword [[RESULT]]
+define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+ %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+ %a = load float addrspace(1)* %gep.0
+ %b = load float addrspace(1)* %gep.1
+ %c = load float addrspace(1)* %gep.2
+
+ %mul = fmul float %a, %b
+ %fma = fadd float %mul, %c
+ store float %fma, float addrspace(1)* %gep.out
+ ret void
+}
+
+; (fadd (fmul x, y), z) -> (fma x, y, z)
+; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]
+
+; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
+; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
+; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+
+; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3
+ %gep.out.0 = getelementptr float addrspace(1)* %out, i32 %tid
+ %gep.out.1 = getelementptr float addrspace(1)* %gep.out.0, i32 1
+
+ %a = load float addrspace(1)* %gep.0
+ %b = load float addrspace(1)* %gep.1
+ %c = load float addrspace(1)* %gep.2
+ %d = load float addrspace(1)* %gep.3
+
+ %mul = fmul float %a, %b
+ %fma0 = fadd float %mul, %c
+ %fma1 = fadd float %mul, %d
+
+ store float %fma0, float addrspace(1)* %gep.out.0
+ store float %fma1, float addrspace(1)* %gep.out.1
+ ret void
+}
+
+; (fadd x, (fmul y, z)) -> (fma y, z, x)
+; FUNC-LABEL: {{^}}combine_to_mad_f32_1:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
+
+; SI: buffer_store_dword [[RESULT]]
+define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+ %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+ %a = load float addrspace(1)* %gep.0
+ %b = load float addrspace(1)* %gep.1
+ %c = load float addrspace(1)* %gep.2
+
+ %mul = fmul float %a, %b
+ %fma = fadd float %c, %mul
+ store float %fma, float addrspace(1)* %gep.out
+ ret void
+}
+
+; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
+
+; SI: buffer_store_dword [[RESULT]]
+define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+ %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+ %a = load float addrspace(1)* %gep.0
+ %b = load float addrspace(1)* %gep.1
+ %c = load float addrspace(1)* %gep.2
+
+ %mul = fmul float %a, %b
+ %fma = fsub float %mul, %c
+ store float %fma, float addrspace(1)* %gep.out
+ ret void
+}
+
+; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
+
+; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
+; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+
+; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3
+ %gep.out.0 = getelementptr float addrspace(1)* %out, i32 %tid
+ %gep.out.1 = getelementptr float addrspace(1)* %gep.out.0, i32 1
+
+ %a = load float addrspace(1)* %gep.0
+ %b = load float addrspace(1)* %gep.1
+ %c = load float addrspace(1)* %gep.2
+ %d = load float addrspace(1)* %gep.3
+
+ %mul = fmul float %a, %b
+ %fma0 = fsub float %mul, %c
+ %fma1 = fsub float %mul, %d
+ store float %fma0, float addrspace(1)* %gep.out.0
+ store float %fma1, float addrspace(1)* %gep.out.1
+ ret void
+}
+
+; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
+
+; SI: buffer_store_dword [[RESULT]]
+define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+ %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+ %a = load float addrspace(1)* %gep.0
+ %b = load float addrspace(1)* %gep.1
+ %c = load float addrspace(1)* %gep.2
+
+ %mul = fmul float %a, %b
+ %fma = fsub float %c, %mul
+ store float %fma, float addrspace(1)* %gep.out
+ ret void
+}
+
+; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]
+
+; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
+; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
+
+; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3
+ %gep.out.0 = getelementptr float addrspace(1)* %out, i32 %tid
+ %gep.out.1 = getelementptr float addrspace(1)* %gep.out.0, i32 1
+
+ %a = load float addrspace(1)* %gep.0
+ %b = load float addrspace(1)* %gep.1
+ %c = load float addrspace(1)* %gep.2
+ %d = load float addrspace(1)* %gep.3
+
+ %mul = fmul float %a, %b
+ %fma0 = fsub float %c, %mul
+ %fma1 = fsub float %d, %mul
+ store float %fma0, float addrspace(1)* %gep.out.0
+ store float %fma1, float addrspace(1)* %gep.out.1
+ ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]
+
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[TMP]], [[C]]
+
+; SI: buffer_store_dword [[RESULT]]
+define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+ %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+ %a = load float addrspace(1)* %gep.0
+ %b = load float addrspace(1)* %gep.1
+ %c = load float addrspace(1)* %gep.2
+
+ %mul = fmul float %a, %b
+ %mul.neg = fsub float -0.0, %mul
+ %fma = fsub float %mul.neg, %c
+
+ store float %fma, float addrspace(1)* %gep.out
+ ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]
+
+; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT1:v[0-9]+]], -[[TMP]], [[D]]
+
+; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3
+ %gep.out.0 = getelementptr float addrspace(1)* %out, i32 %tid
+ %gep.out.1 = getelementptr float addrspace(1)* %gep.out.0, i32 1
+
+ %a = load float addrspace(1)* %gep.0
+ %b = load float addrspace(1)* %gep.1
+ %c = load float addrspace(1)* %gep.2
+ %d = load float addrspace(1)* %gep.3
+
+ %mul = fmul float %a, %b
+ %mul.neg = fsub float -0.0, %mul
+ %fma0 = fsub float %mul.neg, %c
+ %fma1 = fsub float %mul.neg, %d
+
+ store float %fma0, float addrspace(1)* %gep.out.0
+ store float %fma1, float addrspace(1)* %gep.out.1
+ ret void
+}
+
+; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+
+; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
+
+; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]
+
+; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3
+ %gep.out.0 = getelementptr float addrspace(1)* %out, i32 %tid
+ %gep.out.1 = getelementptr float addrspace(1)* %gep.out.0, i32 1
+
+ %a = load float addrspace(1)* %gep.0
+ %b = load float addrspace(1)* %gep.1
+ %c = load float addrspace(1)* %gep.2
+ %d = load float addrspace(1)* %gep.3
+
+ %mul = fmul float %a, %b
+ %mul.neg = fsub float -0.0, %mul
+ %fma0 = fsub float %mul.neg, %c
+ %fma1 = fsub float %mul, %d
+
+ store float %fma0, float addrspace(1)* %gep.out.0
+ store float %fma1, float addrspace(1)* %gep.out.1
+ ret void
+}
+
+; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+
+; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
+; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]
+
+; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], [[D]], [[E]], -[[C]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP0]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[C]], [[TMP1]]
+
+; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3
+ %gep.4 = getelementptr float addrspace(1)* %gep.0, i32 4
+ %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+ %x = load float addrspace(1)* %gep.0
+ %y = load float addrspace(1)* %gep.1
+ %z = load float addrspace(1)* %gep.2
+ %u = load float addrspace(1)* %gep.3
+ %v = load float addrspace(1)* %gep.4
+
+ %tmp0 = fmul float %u, %v
+ %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0
+ %tmp2 = fsub float %tmp1, %z
+
+ store float %tmp2, float addrspace(1)* %gep.out
+ ret void
+}
+
+; fold (fsub x, (fma y, z, (fmul u, v)))
+; -> (fma (fneg y), z, (fma (fneg u), v, x))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+
+; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
+; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
+
+; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], -[[D]], [[E]], [[A]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP0]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
+
+; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: s_endpgm
+define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3
+ %gep.4 = getelementptr float addrspace(1)* %gep.0, i32 4
+ %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+ %x = load float addrspace(1)* %gep.0
+ %y = load float addrspace(1)* %gep.1
+ %z = load float addrspace(1)* %gep.2
+ %u = load float addrspace(1)* %gep.3
+ %v = load float addrspace(1)* %gep.4
+
+ %tmp0 = fmul float %u, %v
+ %tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0
+ %tmp2 = fsub float %x, %tmp1
+
+ store float %tmp2, float addrspace(1)* %gep.out
+ ret void
+}
+
+; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+
+; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]]
+
+; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]]
+
+; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: s_endpgm
+define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3
+ %gep.4 = getelementptr float addrspace(1)* %gep.0, i32 4
+ %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+ %x = load float addrspace(1)* %gep.0
+ %y = load float addrspace(1)* %gep.1
+ %z = load float addrspace(1)* %gep.2
+ %u = load float addrspace(1)* %gep.3
+ %v = load float addrspace(1)* %gep.4
+
+ %tmp0 = fmul float %u, %v
+ %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0
+ %tmp2 = fsub float %tmp1, %z
+
+ store float %tmp2, float addrspace(1)* %gep.out
+ ret void
+}
+
+; fold (fsub x, (fmuladd y, z, (fmul u, v)))
+; -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x))
+
+; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
+
+; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
+; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]
+
+; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
+; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]]
+; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
+; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[A]]
+
+; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: s_endpgm
+define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() #0
+ %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+ %gep.2 = getelementptr float addrspace(1)* %gep.0, i32 2
+ %gep.3 = getelementptr float addrspace(1)* %gep.0, i32 3
+ %gep.4 = getelementptr float addrspace(1)* %gep.0, i32 4
+ %gep.out = getelementptr float addrspace(1)* %out, i32 %tid
+
+ %x = load float addrspace(1)* %gep.0
+ %y = load float addrspace(1)* %gep.1
+ %z = load float addrspace(1)* %gep.2
+ %u = load float addrspace(1)* %gep.3
+ %v = load float addrspace(1)* %gep.4
+
+ %tmp0 = fmul float %u, %v
+ %tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
+ %tmp2 = fsub float %x, %tmp1
+
+ store float %tmp2, float addrspace(1)* %gep.out
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
OpenPOWER on IntegriCloud