Diffstat (limited to 'llvm')
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp              |  95
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll  | 487
2 files changed, 573 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index aaa9547fef0..2ba570b9ebb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4617,15 +4617,99 @@ SDValue SITargetLowering::performClassCombine(SDNode *N,
return SDValue();
}
+static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
+ if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
+ return true;
+
+ return DAG.isKnownNeverNaN(Op);
+}
+
+static bool isCanonicalized(SDValue Op, const SISubtarget *ST,
+ unsigned MaxDepth = 5) {
+ // If the source is the result of another standard FP operation, it is
+ // already in canonical form.
+
+ switch (Op.getOpcode()) {
+ default:
+ break;
+
+ // These will flush denorms if required.
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FSQRT:
+ case ISD::FCEIL:
+ case ISD::FFLOOR:
+ case ISD::FMA:
+ case ISD::FMAD:
+
+ case ISD::FCANONICALIZE:
+ return true;
+
+ case ISD::FP_ROUND:
+ return Op.getValueType().getScalarType() != MVT::f16 ||
+ ST->hasFP16Denormals();
+
+ case ISD::FP_EXTEND:
+ return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 ||
+ ST->hasFP16Denormals();
+
+ case ISD::FP16_TO_FP:
+ case ISD::FP_TO_FP16:
+ return ST->hasFP16Denormals();
+
+ // These can be lowered to or combined as bit operations, so we need to
+ // check their inputs recursively.
+ case ISD::FNEG:
+ case ISD::FABS:
+ return (MaxDepth > 0) &&
+ isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1);
+
+ case ISD::FSIN:
+ case ISD::FCOS:
+ case ISD::FSINCOS:
+ return Op.getValueType().getScalarType() != MVT::f16;
+
+ // On pre-GFX9 targets V_MIN_F32 and friends do not flush denorms, so for
+ // such targets we need to check their inputs recursively.
+ // TODO: On GFX9+ we could return true without checking the inputs, provided
+ // a no-nan mode is in effect, since canonicalization is also used to quiet
+ // sNaNs.
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FMINNAN:
+ case ISD::FMAXNAN:
+ return (MaxDepth > 0) &&
+ isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1) &&
+ isCanonicalized(Op.getOperand(1), ST, MaxDepth - 1);
+
+ case ISD::ConstantFP: {
+ auto F = cast<ConstantFPSDNode>(Op)->getValueAPF();
+ return !F.isDenormal() && !(F.isNaN() && F.isSignaling());
+ }
+ }
+ return false;
+}
+
// Constant fold canonicalize.
SDValue SITargetLowering::performFCanonicalizeCombine(
SDNode *N,
DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0));
- if (!CFP)
+
+ if (!CFP) {
+ SDValue N0 = N->getOperand(0);
+
+ bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
+
+ if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) &&
+ isCanonicalized(N0, getSubtarget()))
+ return N0;
+
return SDValue();
+ }
- SelectionDAG &DAG = DCI.DAG;
const APFloat &C = CFP->getValueAPF();
// Flush denormals to 0 if not enabled.
@@ -4718,13 +4802,6 @@ SDValue SITargetLowering::performIntMed3ImmCombine(
return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
}
-static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
- if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
- return true;
-
- return DAG.isKnownNeverNaN(Op);
-}
-
SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
const SDLoc &SL,
SDValue Op0,
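In effect, the new isCanonicalized() helper lets performFCanonicalizeCombine() drop an fcanonicalize whose source is already produced in canonical form, instead of leaving it to be selected as a multiply by 1.0. A minimal IR sketch of the pattern this enables (hypothetical function name, mirroring the tests below):

  define float @fold_canonicalize_fmul_sketch(float %x) {
    ; fmul already yields a canonical result (denorms flushed per the FP mode,
    ; NaNs quieted), so the canonicalize call below folds away to %v.
    %v = fmul float %x, 15.0
    %c = call float @llvm.canonicalize.f32(float %v)
    ret float %c
  }
  declare float @llvm.canonicalize.f32(float)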
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
new file mode 100644
index 00000000000..5383bbe71ae
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -0,0 +1,487 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GCN-FLUSH %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=-fp32-denormals,+fp-exceptions < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-EXCEPT -check-prefix=VI -check-prefix=GCN-FLUSH %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX9-DENORM %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GCN-FLUSH %s
+
+; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32:
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+define amdgpu_kernel void @test_no_fold_canonicalize_loaded_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %v = load float, float addrspace(1)* %gep, align 4
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
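+; Note: when the fold does not fire, fcanonicalize is selected as a multiply
+; by 1.0, which is what the v_mul_f32_e32 {{.*}}, 1.0, {{.*}} checks in this
+; file match; the folded cases instead verify that no 1.0 multiply remains.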
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_value_f32:
+; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fmul_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = fmul float %load, 15.0
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_sub_value_f32:
+; GCN: v_sub_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = fsub float 15.0, %load
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_add_value_f32:
+; GCN: v_add_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = fadd float %load, 15.0
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_fold_canonicalize_sqrt_value_f32:
+; GCN: v_sqrt_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = call float @llvm.sqrt.f32(float %load)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fceil_value_f32:
+; GCN: v_ceil_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = call float @llvm.ceil.f32(float %load)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_floor_value_f32:
+; GCN: v_floor_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = call float @llvm.floor.f32(float %load)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fma_value_f32:
+; GCN: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = call float @llvm.fma.f32(float %load, float 15.0, float 15.0)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fmuladd_value_f32:
+; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9-DENORM: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = call float @llvm.fmuladd.f32(float %load, float 15.0, float 15.0)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_canonicalize_value_f32:
+; GCN: flat_load_dword [[LOAD:v[0-9]+]],
+; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = call float @llvm.canonicalize.f32(float %load)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fpextend_value_f64_f32:
+; GCN: v_cvt_f64_f32_e32 [[V:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
+; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(float addrspace(1)* %arg, double addrspace(1)* %out) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = fpext float %load to double
+ %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
+ %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id
+ store double %canonicalized, double addrspace(1)* %gep2, align 8
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fpextend_value_f32_f16:
+; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(half addrspace(1)* %arg, float addrspace(1)* %out) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
+ %load = load half, half addrspace(1)* %gep, align 2
+ %v = fpext half %load to float
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
+ store float %canonicalized, float addrspace(1)* %gep2, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fpround_value_f32_f64:
+; GCN: v_cvt_f32_f64_e32 [[V:v[0-9]+]], v[{{[0-9:]+}}]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f32_f64(double addrspace(1)* %arg, float addrspace(1)* %out) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
+ %load = load double, double addrspace(1)* %gep, align 8
+ %v = fptrunc double %load to float
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
+ store float %canonicalized, float addrspace(1)* %gep2, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fpround_value_f16_f32:
+; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_short v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32(float addrspace(1)* %arg, half addrspace(1)* %out) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = fptrunc float %load to half
+ %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
+ %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
+ store half %canonicalized, half addrspace(1)* %gep2, align 2
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fpround_value_v2f16_v2f32:
+; GCN-DAG: v_cvt_f16_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
+; VI-DAG: v_cvt_f16_f32_sdwa [[V1:v[0-9]+]], v{{[0-9]+}}
+; VI: v_or_b32_e32 [[V:v[0-9]+]], [[V0]], [[V1]]
+; GFX9: v_cvt_f16_f32_e32 [[V1:v[0-9]+]], v{{[0-9]+}}
+; GFX9: v_and_b32_e32 [[V0_16:v[0-9]+]], 0xffff, [[V0]]
+; GFX9: v_lshl_or_b32 [[V:v[0-9]+]], [[V1]], 16, [[V0_16]]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(<2 x float> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %arg, i32 %id
+ %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
+ %v = fptrunc <2 x float> %load to <2 x half>
+ %canonicalized = tail call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %v)
+ %gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %id
+ store <2 x half> %canonicalized, <2 x half> addrspace(1)* %gep2, align 4
+ ret void
+}
+
+; GCN-LABEL: test_no_fold_canonicalize_fneg_value_f32:
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, 1.0, -v{{[0-9]+}}
+define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = fsub float -0.0, %load
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fneg_value_f32:
+; GCN: v_xor_b32_e32 [[V:v[0-9]+]], 0x80000000, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v0 = fadd float %load, 0.0
+ %v = fsub float -0.0, %v0
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_no_fold_canonicalize_fabs_value_f32:
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}|
+define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = tail call float @llvm.fabs.f32(float %load)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_fabs_value_f32:
+; GCN: v_and_b32_e32 [[V:v[0-9]+]], 0x7fffffff, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v0 = fadd float %load, 0.0
+ %v = tail call float @llvm.fabs.f32(float %v0)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_sin_value_f32:
+; GCN: v_sin_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = tail call float @llvm.sin.f32(float %load)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_cos_value_f32:
+; GCN: v_cos_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = tail call float @llvm.cos.f32(float %load)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_sin_value_f16:
+; GCN: v_sin_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
+; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]]
+; GCN: flat_store_short v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(half addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
+ %load = load half, half addrspace(1)* %gep, align 2
+ %v = tail call half @llvm.sin.f16(half %load)
+ %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
+ store half %canonicalized, half addrspace(1)* %gep, align 2
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_cos_value_f16:
+; GCN: v_cos_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
+; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]]
+; GCN: flat_store_short v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(half addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
+ %load = load half, half addrspace(1)* %gep, align 2
+ %v = tail call half @llvm.cos.f16(half %load)
+ %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
+ store half %canonicalized, half addrspace(1)* %gep, align 2
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_qNaN_value_f32:
+; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x7fc00000
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %canonicalized = tail call float @llvm.canonicalize.f32(float 0x7FF8000000000000)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32:
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = tail call float @llvm.minnum.f32(float %load, float 0.0)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_minnum_value_f32:
+; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v0 = fadd float %load, 0.0
+ %v = tail call float @llvm.minnum.f32(float %v0, float 0.0)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_sNaN_value_f32:
+; GCN: v_min_f32_e32 [[V0:v[0-9]+]], 0x7f800001, v{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[V0]]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 2139095041 to float))
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
+; GCN: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[V0]]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float))
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32:
+; GCN: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
+; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[V0]]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v = tail call float @llvm.maxnum.f32(float %load, float 0.0)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_maxnum_value_f32:
+; GCN: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f32(float addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+ %load = load float, float addrspace(1)* %gep, align 4
+ %v0 = fadd float %load, 0.0
+ %v = tail call float @llvm.maxnum.f32(float %v0, float 0.0)
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ store float %canonicalized, float addrspace(1)* %gep, align 4
+ ret void
+}
+
+; GCN-LABEL: test_fold_canonicalize_maxnum_value_f64:
+; GCN: v_max_f64 [[V:v\[[0-9]+:[0-9]+\]]], v[{{[0-9:]+}}], 0
+; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], [[V]]
+; GCN-NOT: 1.0
+define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f64(double addrspace(1)* %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
+ %load = load double, double addrspace(1)* %gep, align 8
+ %v0 = fadd double %load, 0.0
+ %v = tail call double @llvm.maxnum.f64(double %v0, double 0.0)
+ %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
+ store double %canonicalized, double addrspace(1)* %gep, align 8
+ ret void
+}
+
+; GCN-LABEL: test_no_fold_canonicalize_fmul_value_f32_no_ieee:
+; GCN-EXCEPT: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+define amdgpu_ps float @test_no_fold_canonicalize_fmul_value_f32_no_ieee(float %arg) {
+entry:
+ %v = fmul float %arg, 15.0
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ ret float %canonicalized
+}
+
+; GCN-LABEL: test_fold_canonicalize_fmul_nnan_value_f32_no_ieee:
+; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
+; GCN-NEXT: ; return
+; GCN-NOT: 1.0
+define amdgpu_ps float @test_fold_canonicalize_fmul_nnan_value_f32_no_ieee(float %arg) {
+entry:
+ %v = fmul nnan float %arg, 15.0
+ %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+ ret float %canonicalized
+}
+
+declare float @llvm.canonicalize.f32(float) #0
+declare double @llvm.canonicalize.f64(double) #0
+declare half @llvm.canonicalize.f16(half) #0
+declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare float @llvm.sqrt.f32(float) #0
+declare float @llvm.ceil.f32(float) #0
+declare float @llvm.floor.f32(float) #0
+declare float @llvm.fma.f32(float, float, float) #0
+declare float @llvm.fmuladd.f32(float, float, float) #0
+declare float @llvm.fabs.f32(float) #0
+declare float @llvm.sin.f32(float) #0
+declare float @llvm.cos.f32(float) #0
+declare half @llvm.sin.f16(half) #0
+declare half @llvm.cos.f16(half) #0
+declare float @llvm.minnum.f32(float, float) #0
+declare float @llvm.maxnum.f32(float, float) #0
+declare double @llvm.maxnum.f64(double, double) #0
+
+attributes #0 = { nounwind readnone }
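A quick way to exercise these checks outside of lit is to expand the first RUN line by hand, substituting this file for %s (path assumed to be the in-tree location):

  llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=-fp32-denormals \
    < llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll | \
    FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GCN-FLUSH \
    llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll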