diff options
author | Jan Vesely <jan.vesely@rutgers.edu> | 2020-02-04 19:27:19 -0500 |
---|---|---|
committer | Hans Wennborg <hans@chromium.org> | 2020-02-10 14:23:15 +0100 |
commit | b73942dbc144c11dc94fd32a7d8025a22e7e1d6b (patch) | |
tree | 6cb8a9d4ae0eb1b39319fd14187d9ae4d0cd370d /llvm/test/CodeGen/AMDGPU | |
parent | 84cda4cceabdfec4f130bfafe7bbd050aa65b2ec (diff) | |
download | bcm5719-llvm-b73942dbc144c11dc94fd32a7d8025a22e7e1d6b.tar.gz bcm5719-llvm-b73942dbc144c11dc94fd32a7d8025a22e7e1d6b.zip |
AMDGPU/EG,CM: Implement fsqrt using recip(rsqrt(x)) instead of x * rsqrt(x)
The old version might be faster on EG (RECIP_IEEE is Trans only),
but it'd need extra corner case checks.
This gives correct corner case behaviour and saves a register.
Fixes OCL CTS sqrt test (1-thread, scalar) on Turks.
Reviewer: arsenm
Differential Revision: https://reviews.llvm.org/D74017
(cherry picked from commit e6686adf8a743564f0c455c34f04752ab08cf642)
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/fsqrt.ll | 38 |
1 files changed, 24 insertions, 14 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.ll
index 6bd9a0db14f..ba5f79d9bf5 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.ll
@@ -27,8 +27,8 @@ define amdgpu_kernel void @v_unsafe_fsqrt_f32(float addrspace(1)* %out, float ad
 ; FUNC-LABEL: {{^}}s_sqrt_f32:
 ; GCN: v_sqrt_f32_e32
 
-; R600: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z
-; R600: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS
+; R600: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[2].Z
+; R600: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, PS
 define amdgpu_kernel void @s_sqrt_f32(float addrspace(1)* %out, float %in) #1 {
 entry:
   %fdiv = call float @llvm.sqrt.f32(float %in)
@@ -40,10 +40,10 @@ entry:
 ; GCN: v_sqrt_f32_e32
 ; GCN: v_sqrt_f32_e32
 
-; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].W
-; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS
-; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X
-; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS
+; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[2].W
+; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, PS
+; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].X
+; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}
 define amdgpu_kernel void @s_sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
 entry:
   %fdiv = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
@@ -57,14 +57,14 @@ entry:
 ; GCN: v_sqrt_f32_e32
 ; GCN: v_sqrt_f32_e32
 
-; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Y
-; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Y, PS
-; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Z
-; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Z, PS
-; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].W
-; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS
-; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X
-; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS
+; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].Y
+; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, PS
+; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].Z
+; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}
+; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].W
+; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}
+; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[4].X
+; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}
 define amdgpu_kernel void @s_sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 {
 entry:
   %fdiv = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
@@ -134,6 +134,16 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}recip_sqrt:
+; R600: RECIPSQRT_IEEE
+; R600-NOT: RECIP_IEEE
+define amdgpu_kernel void @recip_sqrt(float addrspace(1)* %out, float %src) nounwind {
+  %sqrt = call float @llvm.sqrt.f32(float %src)
+  %recipsqrt = fdiv fast float 1.0, %sqrt
+  store float %recipsqrt, float addrspace(1)* %out, align 4
+  ret void
+}
+
 declare float @llvm.sqrt.f32(float %in) #0
 declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) #0
 declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) #0