summary refs log tree commit diff stats
path: root/llvm
diff options
context:
space:
mode:
Diffstat (limited to 'llvm')
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h 4
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp 27
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll 62
3 files changed, 79 insertions, 14 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 22cede59086..d4b6a5fe802 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -359,6 +359,10 @@ public:
return FP64FP16Denormals;
}
+ bool supportsMinMaxDenormModes() const {
+ return getGeneration() >= AMDGPUSubtarget::GFX9;
+ }
+
bool hasFPExceptions() const {
return FPExceptions;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2ba570b9ebb..af4a2a9c679 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4624,8 +4624,8 @@ static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
return DAG.isKnownNeverNaN(Op);
}
-static bool isCanonicalized(SDValue Op, const SISubtarget *ST,
- unsigned MaxDepth=5) {
+static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
+ const SISubtarget *ST, unsigned MaxDepth=5) {
// If source is a result of another standard FP operation it is already in
// canonical form.
@@ -4663,7 +4663,7 @@ static bool isCanonicalized(SDValue Op, const SISubtarget *ST,
case ISD::FNEG:
case ISD::FABS:
return (MaxDepth > 0) &&
- isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1);
+ isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1);
case ISD::FSIN:
case ISD::FCOS:
@@ -4672,16 +4672,19 @@ static bool isCanonicalized(SDValue Op, const SISubtarget *ST,
// In pre-GFX9 targets V_MIN_F32 and others do not flush denorms.
// For such targets need to check their input recursively.
- // TODO: on GFX9+ we could return true without checking provided no-nan
- // mode, since canonicalization is also used to quiet sNaNs.
case ISD::FMINNUM:
case ISD::FMAXNUM:
case ISD::FMINNAN:
case ISD::FMAXNAN:
+ if (ST->supportsMinMaxDenormModes() &&
+ DAG.isKnownNeverNaN(Op.getOperand(0)) &&
+ DAG.isKnownNeverNaN(Op.getOperand(1)))
+ return true;
+
return (MaxDepth > 0) &&
- isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1) &&
- isCanonicalized(Op.getOperand(1), ST, MaxDepth - 1);
+ isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) &&
+ isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1);
case ISD::ConstantFP: {
auto F = cast<ConstantFPSDNode>(Op)->getValueAPF();
@@ -4700,11 +4703,19 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
if (!CFP) {
SDValue N0 = N->getOperand(0);
+ EVT VT = N0.getValueType().getScalarType();
+ auto ST = getSubtarget();
+
+ if (((VT == MVT::f32 && ST->hasFP32Denormals()) ||
+ (VT == MVT::f64 && ST->hasFP64Denormals()) ||
+ (VT == MVT::f16 && ST->hasFP16Denormals())) &&
+ DAG.isKnownNeverNaN(N0))
+ return N0;
bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) &&
- isCanonicalized(N0, getSubtarget()))
+ isCanonicalized(DAG, N0, ST))
return N0;
return SDValue();
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
index 5383bbe71ae..5ffa45595e7 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -347,7 +347,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace
}
; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32:
-; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+; GFX9: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
+; GFX9: flat_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -388,9 +390,11 @@ define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace
}
; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
-; GCN: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
-; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]]
-; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GFX9: v_min_f32_e32 [[V:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
+; VI: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
+; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GFX9-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -402,9 +406,11 @@ define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspa
}
; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32:
-; GCN: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
-; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]]
+; GFX9: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
+; VI: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
+; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]]
; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GFX9-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -465,6 +471,49 @@ entry:
ret float %canonicalized
}
+; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f32
+; GFX9-DENORM: flat_load_dword [[V:v[0-9]+]],
+; GFX9-DENORM: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GFX9-DENORM-NOT: 1.0
+; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f32(float addrspace(1)* %arg, float addrspace(1)* %out) #1 {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %v = load float, float addrspace(1)* %gep, align 4
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
+ store float %canonicalized, float addrspace(1)* %gep2, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f64
+; GCN: flat_load_dwordx2 [[V:v\[[0-9:]+\]]],
+; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(double addrspace(1)* %arg, double addrspace(1)* %out) #1 {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
+ %v = load double, double addrspace(1)* %gep, align 8
+ %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
+ %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id
+ store double %canonicalized, double addrspace(1)* %gep2, align 8
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16
+; GCN: flat_load_ushort [[V:v[0-9]+]],
+; GCN: flat_store_short v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(half addrspace(1)* %arg, half addrspace(1)* %out) #1 {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
+ %v = load half, half addrspace(1)* %gep, align 2
+ %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
+ %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
+ store half %canonicalized, half addrspace(1)* %gep2, align 2
+ ret void
+}
+
declare float @llvm.canonicalize.f32(float) #0
declare double @llvm.canonicalize.f64(double) #0
declare half @llvm.canonicalize.f16(half) #0
@@ -485,3 +534,4 @@ declare float @llvm.maxnum.f32(float, float) #0
declare double @llvm.maxnum.f64(double, double) #0
attributes #0 = { nounwind readnone }
+attributes #1 = { "no-nans-fp-math"="true" }
OpenPOWER on IntegriCloud