diff options
 -rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp   |  20
 -rw-r--r--  llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll | 459
 2 files changed, 472 insertions, 7 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index f0b9c9e9455..53fb9e3cf1c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -372,13 +372,18 @@ bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(    return true;  } -static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) { +static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {    const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);    if (!CNum) -    return false; +    return HasDenormals; + +  if (UnsafeDiv) +    return true; + +  bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);    // Reciprocal f32 is handled separately without denormals. -  return UnsafeDiv || CNum->isExactlyValue(+1.0); +  return HasDenormals ^ IsOne;  }  // Insert an intrinsic for fast fdiv for safe math situations where we can @@ -404,7 +409,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {                                        FMF.allowReciprocal();    // With UnsafeDiv node will be optimized to just rcp and mul. 
-  if (ST->hasFP32Denormals() || UnsafeDiv) +  if (UnsafeDiv)      return false;    IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath); @@ -418,6 +423,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {    Value *NewFDiv = nullptr; +  bool HasDenormals = ST->hasFP32Denormals();    if (VectorType *VT = dyn_cast<VectorType>(Ty)) {      NewFDiv = UndefValue::get(VT); @@ -428,7 +434,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {        Value *DenEltI = Builder.CreateExtractElement(Den, I);        Value *NewElt; -      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) { +      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {          NewElt = Builder.CreateFDiv(NumEltI, DenEltI);        } else {          NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }); @@ -437,7 +443,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {        NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);      }    } else { -    if (!shouldKeepFDivF32(Num, UnsafeDiv)) +    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))        NewFDiv = Builder.CreateCall(Decl, { Num, Den });    } @@ -447,7 +453,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {      FDiv.eraseFromParent();    } -  return true; +  return !!NewFDiv;  }  static bool hasUnsafeFPMath(const Function &F) { diff --git a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll new file mode 100644 index 00000000000..b5d008e9d19 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll @@ -0,0 +1,459 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+fp32-denormals < %s | FileCheck --check-prefixes=GCN,GCN-DENORM %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-fp32-denormals < %s | FileCheck --check-prefixes=GCN,GCN-FLUSH %s + +; GCN-LABEL: {{^}}div_1_by_x_25ulp: +; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 +; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 
0x2f800000 +; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}} +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]] +; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc +; GCN-DENORM:     v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]] +; GCN-DENORM:     v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]] +; GCN-DENORM:     v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]] + +; GCN-FLUSH:      v_rcp_f32_e32 [[OUT:v[0-9]+]], [[VAL]] + +; GCN:            global_store_dword v[{{[0-9:]+}}], [[OUT]], off +define amdgpu_kernel void @div_1_by_x_25ulp(float addrspace(1)* %arg) { +  %load = load float, float addrspace(1)* %arg, align 4 +  %div = fdiv float 1.000000e+00, %load, !fpmath !0 +  store float %div, float addrspace(1)* %arg, align 4 +  ret void +} + +; GCN-LABEL: {{^}}div_minus_1_by_x_25ulp: +; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 +; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 +; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}} +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]] +; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc +; GCN-DENORM:     v_mul_f32_e64 [[PRESCALED:v[0-9]+]], [[VAL]], -[[SCALE]] +; GCN-DENORM:     v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]] +; GCN-DENORM:     v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]] + +; GCN-FLUSH:      v_rcp_f32_e64 [[OUT:v[0-9]+]], -[[VAL]] + +; GCN:            global_store_dword v[{{[0-9:]+}}], [[OUT]], off +define amdgpu_kernel void @div_minus_1_by_x_25ulp(float addrspace(1)* %arg) { +  %load = load float, float addrspace(1)* %arg, align 4 +  %div = fdiv float -1.000000e+00, %load, !fpmath !0 +  store float %div, float addrspace(1)* %arg, align 4 +  ret void +} + +; GCN-LABEL: {{^}}div_1_by_minus_x_25ulp: +; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 +; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 +; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 
0x0{{$}} +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]] +; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc +; GCN-DENORM:     v_mul_f32_e64 [[PRESCALED:v[0-9]+]], -[[VAL]], [[SCALE]] +; GCN-DENORM:     v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]] +; GCN-DENORM:     v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]] + +; GCN-FLUSH:      v_rcp_f32_e64 [[OUT:v[0-9]+]], -[[VAL]] + +; GCN:            global_store_dword v[{{[0-9:]+}}], [[OUT]], off +define amdgpu_kernel void @div_1_by_minus_x_25ulp(float addrspace(1)* %arg) { +  %load = load float, float addrspace(1)* %arg, align 4 +  %neg = fsub float -0.000000e+00, %load +  %div = fdiv float 1.000000e+00, %neg, !fpmath !0 +  store float %div, float addrspace(1)* %arg, align 4 +  ret void +} + +; GCN-LABEL: {{^}}div_minus_1_by_minus_x_25ulp: +; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 +; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 +; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}} +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]] +; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc +; GCN-DENORM:     v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]] +; GCN-DENORM:     v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]] +; GCN-DENORM:     v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]] + +; GCN-FLUSH:      v_rcp_f32_e32 [[OUT:v[0-9]+]], [[VAL]] + +; GCN:            global_store_dword v[{{[0-9:]+}}], [[OUT]], off +define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(float addrspace(1)* %arg) { +  %load = load float, float addrspace(1)* %arg, align 4 +  %neg = fsub float -0.000000e+00, %load +  %div = fdiv float -1.000000e+00, %neg, !fpmath !0 +  store float %div, float addrspace(1)* %arg, align 4 +  ret void +} + +; GCN-LABEL: {{^}}div_v4_1_by_x_25ulp: +; GCN-DAG:        s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}} +; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 +; 
GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc +; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GCN-DENORM-DAG: v_rcp_f32_e32 +; GCN-DENORM-DAG: v_rcp_f32_e32 +; GCN-DENORM-DAG: v_rcp_f32_e32 +; GCN-DENORM-DAG: v_rcp_f32_e32 +; GCN-DENORM-DAG: v_mul_f32_e32 +; GCN-DENORM-DAG: v_mul_f32_e32 +; GCN-DENORM-DAG: v_mul_f32_e32 +; GCN-DENORM-DAG: v_mul_f32_e32 + +; GCN-FLUSH:      v_rcp_f32_e32 v[[OUT0:[0-9]+]], s[[VAL0]] +; GCN-FLUSH:      v_rcp_f32_e32 +; GCN-FLUSH:      v_rcp_f32_e32 +; GCN-FLUSH:      v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]] +; GCN-FLUSH:      global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off +define amdgpu_kernel void @div_v4_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) { +  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16 +  %div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %load, !fpmath !0 +  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16 +  ret void +} + +; GCN-LABEL: {{^}}div_v4_minus_1_by_x_25ulp: +; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 +; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; 
GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc +; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}} +; GCN-DENORM-DAG: v_rcp_f32_e32 +; GCN-DENORM-DAG: v_rcp_f32_e32 +; GCN-DENORM-DAG: v_rcp_f32_e32 +; GCN-DENORM-DAG: v_rcp_f32_e32 +; GCN-DENORM-DAG: v_mul_f32_e32 +; GCN-DENORM-DAG: v_mul_f32_e32 +; GCN-DENORM-DAG: v_mul_f32_e32 +; GCN-DENORM-DAG: v_mul_f32_e32 + +; GCN-FLUSH:      v_rcp_f32_e64 v[[OUT0:[0-9]+]], -s[[VAL0]] +; GCN-FLUSH:      v_rcp_f32_e64 +; GCN-FLUSH:      v_rcp_f32_e64 +; GCN-FLUSH:      v_rcp_f32_e64 v[[OUT3:[0-9]+]], -s[[VAL3]] +define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) { +  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16 +  %div = fdiv <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, %load, !fpmath !0 +  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16 +  ret void +} + +; GCN-LABEL: {{^}}div_v4_1_by_minus_x_25ulp: +; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 +; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc +; 
GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc +; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}} +; GCN-DENORM-DAG: v_rcp_f32_e32 +; GCN-DENORM-DAG: v_rcp_f32_e32 +; GCN-DENORM-DAG: v_rcp_f32_e32 +; GCN-DENORM-DAG: v_rcp_f32_e32 +; GCN-DENORM-DAG: v_mul_f32_e32 +; GCN-DENORM-DAG: v_mul_f32_e32 +; GCN-DENORM-DAG: v_mul_f32_e32 +; GCN-DENORM-DAG: v_mul_f32_e32 + +; GCN-FLUSH:      v_rcp_f32_e64 v[[OUT0:[0-9]+]], -s[[VAL0]] +; GCN-FLUSH:      v_rcp_f32_e64 +; GCN-FLUSH:      v_rcp_f32_e64 +; GCN-FLUSH:      v_rcp_f32_e64 v[[OUT3:[0-9]+]], -s[[VAL3]] +; GCN-FLUSH:      global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off +define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) { +  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16 +  %neg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %load +  %div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %neg, !fpmath !0 +  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16 +  ret void +} + +; GCN-LABEL: {{^}}div_v4_minus_1_by_minus_x_25ulp: +; GCN-DAG:        s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}} +; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 +; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc +; GCN-DENORM-DAG: 
v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc +; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc +; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GCN-DENORM-DAG: v_rcp_f32_e32 +; GCN-DENORM-DAG: v_rcp_f32_e32 +; GCN-DENORM-DAG: v_rcp_f32_e32 +; GCN-DENORM-DAG: v_rcp_f32_e32 +; GCN-DENORM-DAG: v_mul_f32_e32 +; GCN-DENORM-DAG: v_mul_f32_e32 +; GCN-DENORM-DAG: v_mul_f32_e32 +; GCN-DENORM-DAG: v_mul_f32_e32 + +; GCN-FLUSH:      v_rcp_f32_e32 v[[OUT0:[0-9]+]], s[[VAL0]] +; GCN-FLUSH:      v_rcp_f32_e32 +; GCN-FLUSH:      v_rcp_f32_e32 +; GCN-FLUSH:      v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]] +; GCN-FLUSH:      global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off +define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) { +  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16 +  %neg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %load +  %div = fdiv <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, %neg, !fpmath !0 +  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16 +  ret void +} + +; GCN-LABEL: {{^}}div_v4_c_by_x_25ulp: +; GCN-DAG:        v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 +; GCN-DAG:        v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 +; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}} +; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}} +; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 
-2.0{{$}} +; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} +; GCN-DENORM-DAG: v_rcp_f32_e32 +; GCN-DENORM-DAG: v_rcp_f32_e32 + +; GCN-DAG:        v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DAG:        v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc +; GCN-DAG:        v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DAG:        v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc + +; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}} +; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP1:v[0-9]+]], v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP1]] +; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP2:v[0-9]+]], v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP2]] + +; GCN-DENORM-DAG: v_div_fmas_f32 +; GCN-DENORM-DAG: v_div_fmas_f32 +; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, 2.0{{$}} +; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}} + +; GCN-FLUSH-DAG:  v_rcp_f32_e32 +; GCN-FLUSH-DAG:  v_rcp_f32_e64 + +; GCN-NOT:        v_cmp_gt_f32_e64 +; GCN-NOT:        v_cndmask_b32_e32 +; GCN-FLUSH-NOT:  v_div + +; GCN:            global_store_dwordx4 +define amdgpu_kernel void @div_v4_c_by_x_25ulp(<4 x float> addrspace(1)* %arg) { +  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16 +  %div = fdiv <4 x float> <float 2.000000e+00, float 1.000000e+00, float -1.000000e+00, float -2.000000e+00>, %load, !fpmath !0 +  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16 +  ret void +} + +; GCN-LABEL: {{^}}div_v4_c_by_minus_x_25ulp: +; GCN-DAG:        v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 +; GCN-DAG:        v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 +; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} +; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} +; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} +; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} +; GCN-DENORM-DAG: v_rcp_f32_e32 +; GCN-DENORM-DAG: v_rcp_f32_e32 + +; 
GCN-DAG:        v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DAG:        v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc +; GCN-DAG:        v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]] +; GCN-DAG:        v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc + +; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}} +; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP1:v[0-9]+]], v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP1]] +; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP2:v[0-9]+]], v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP2]] + +; GCN-DENORM-DAG: v_div_fmas_f32 +; GCN-DENORM-DAG: v_div_fmas_f32 +; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}} +; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}} + +; GCN-FLUSH-DAG:  v_rcp_f32_e32 +; GCN-FLUSH-DAG:  v_rcp_f32_e64 + +; GCN-NOT:        v_cmp_gt_f32_e64 +; GCN-NOT:        v_cndmask_b32_e32 +; GCN-FLUSH-NOT:  v_div + +; GCN:            global_store_dwordx4 +define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) { +  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16 +  %neg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %load +  %div = fdiv <4 x float> <float 2.000000e+00, float 1.000000e+00, float -1.000000e+00, float -2.000000e+00>, %neg, !fpmath !0 +  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16 +  ret void +} + +; GCN-LABEL: {{^}}div_v_by_x_25ulp: +; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}} + +; GCN-DENORM-DAG: v_div_scale_f32 +; GCN-DENORM-DAG: v_rcp_f32_e32 +; GCN-DENORM-DAG: v_div_scale_f32 +; GCN-DENORM:     v_div_fmas_f32 +; GCN-DENORM:     v_div_fixup_f32 [[OUT:v[0-9]+]], + +; GCN-FLUSH-DAG:  v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 +; GCN-FLUSH-DAG:  v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000 +; GCN-FLUSH-DAG:  v_cmp_gt_f32_e64 vcc, 
|[[VAL]]|, [[L]] +; GCN-FLUSH-DAG:  v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc +; GCN-FLUSH:      v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]] +; GCN-FLUSH:      v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]] +; GCN-FLUSH:      v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]] + +; GCN:            global_store_dword v[{{[0-9:]+}}], [[OUT]], off +define amdgpu_kernel void @div_v_by_x_25ulp(float addrspace(1)* %arg, float %num) { +  %load = load float, float addrspace(1)* %arg, align 4 +  %div = fdiv float %num, %load, !fpmath !0 +  store float %div, float addrspace(1)* %arg, align 4 +  ret void +} + +; GCN-LABEL: {{^}}div_1_by_x_fast: +; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 +; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]] +; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off +define amdgpu_kernel void @div_1_by_x_fast(float addrspace(1)* %arg) { +  %load = load float, float addrspace(1)* %arg, align 4 +  %div = fdiv fast float 1.000000e+00, %load +  store float %div, float addrspace(1)* %arg, align 4 +  ret void +} + +; GCN-LABEL: {{^}}div_minus_1_by_x_fast: +; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 +; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]] +; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off +define amdgpu_kernel void @div_minus_1_by_x_fast(float addrspace(1)* %arg) { +  %load = load float, float addrspace(1)* %arg, align 4 +  %div = fdiv fast float -1.000000e+00, %load +  store float %div, float addrspace(1)* %arg, align 4 +  ret void +} + +; GCN-LABEL: {{^}}div_1_by_minus_x_fast: +; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 +; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]] +; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off +define amdgpu_kernel void @div_1_by_minus_x_fast(float addrspace(1)* %arg) { +  %load = load float, float addrspace(1)* %arg, align 4 +  %neg = fsub float -0.000000e+00, %load +  %div = fdiv fast float 1.000000e+00, %neg +  store float %div, float addrspace(1)* %arg, align 4 +  ret void +} 
+ +; GCN-LABEL: {{^}}div_minus_1_by_minus_x_fast: +; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 +; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]] +; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off +define amdgpu_kernel void @div_minus_1_by_minus_x_fast(float addrspace(1)* %arg) { +  %load = load float, float addrspace(1)* %arg, align 4 +  %neg = fsub float -0.000000e+00, %load +  %div = fdiv fast float -1.000000e+00, %neg +  store float %div, float addrspace(1)* %arg, align 4 +  ret void +} + +; GCN-LABEL: {{^}}div_1_by_x_correctly_rounded: +; GCN-DENORM-DAG: v_div_scale_f32 +; GCN-DENORM-DAG: v_rcp_f32_e32 +; GCN-DENORM-DAG: v_div_scale_f32 +; GCN-DENORM:     v_div_fmas_f32 +; GCN-DENORM:     v_div_fixup_f32 + +; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 +; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]] +; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off +define amdgpu_kernel void @div_1_by_x_correctly_rounded(float addrspace(1)* %arg) { +  %load = load float, float addrspace(1)* %arg, align 4 +  %div = fdiv float 1.000000e+00, %load +  store float %div, float addrspace(1)* %arg, align 4 +  ret void +} + +; GCN-LABEL: {{^}}div_minus_1_by_x_correctly_rounded: +; GCN-DENORM-DAG: v_div_scale_f32 +; GCN-DENORM-DAG: v_rcp_f32_e32 +; GCN-DENORM-DAG: v_div_scale_f32 +; GCN-DENORM:     v_div_fmas_f32 +; GCN-DENORM:     v_div_fixup_f32 + +; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 +; GCN-FLUSH: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]] +; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off +define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(float addrspace(1)* %arg) { +  %load = load float, float addrspace(1)* %arg, align 4 +  %div = fdiv float -1.000000e+00, %load +  store float %div, float addrspace(1)* %arg, align 4 +  ret void +} + +; GCN-LABEL: {{^}}div_1_by_minus_x_correctly_rounded: +; GCN-DENORM-DAG: v_div_scale_f32 +; GCN-DENORM-DAG: v_rcp_f32_e32 +; GCN-DENORM-DAG: v_div_scale_f32 +; GCN-DENORM:     
v_div_fmas_f32 +; GCN-DENORM:     v_div_fixup_f32 + +; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 +; GCN-FLUSH: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]] +; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off +define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) { +  %load = load float, float addrspace(1)* %arg, align 4 +  %neg = fsub float -0.000000e+00, %load +  %div = fdiv float 1.000000e+00, %neg +  store float %div, float addrspace(1)* %arg, align 4 +  ret void +} + +; GCN-LABEL: {{^}}div_minus_1_by_minus_x_correctly_rounded: +; GCN-DENORM-DAG: v_div_scale_f32 +; GCN-DENORM-DAG: v_rcp_f32_e32 +; GCN-DENORM-DAG: v_div_scale_f32 +; GCN-DENORM:     v_div_fmas_f32 +; GCN-DENORM:     v_div_fixup_f32 + +; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0 +; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]] +; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off +define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) { +  %load = load float, float addrspace(1)* %arg, align 4 +  %neg = fsub float -0.000000e+00, %load +  %div = fdiv float -1.000000e+00, %neg +  store float %div, float addrspace(1)* %arg, align 4 +  ret void +} + +!0 = !{float 2.500000e+00}

