LoopUnroll: Allow analyzing intrinsic call costs

I'm not sure why the code here is skipping calls, since TTI does try to do something for general calls, but it should at least allow intrinsics. Allow analyzing intrinsics that will not be emitted as calls, which is by far the most common case on AMDGPU; calls that are emitted as real calls are still skipped.

llvm-svn: 335645
author: Matt Arsenault <Matthew.Arsenault@amd.com> 2018-06-26 18:51:17 +0000
committer: Matt Arsenault <Matthew.Arsenault@amd.com> 2018-06-26 18:51:17 +0000
commit: 2c1a570aabc184e93f160992c3fe3eab8f884f74 (patch)
tree: 4accff69a6dbb2c0038ffeab5109107289a83133 /llvm/test/Transforms/LoopUnroll/AMDGPU
parent: e745cf9bf31334cc89f0766269f904117f769b68 (diff)
download: bcm5719-llvm-2c1a570aabc184e93f160992c3fe3eab8f884f74.tar.gz
bcm5719-llvm-2c1a570aabc184e93f160992c3fe3eab8f884f74.zip
1 files changed, 77 insertions, 0 deletions
diff --git a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-cost-call.ll b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-cost-call.ll
new file mode 100644
index 00000000000..9ca109c4f74
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-cost-call.ll
@@ -0,0 +1,77 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -loop-unroll -unroll-threshold=100 -unroll-peel-count=0 -unroll-allow-partial=false -unroll-max-iteration-count-to-analyze=16 < %s | FileCheck %s
+
+; CHECK-LABEL: @test_intrinsic_call_cost(
+; CHECK-NOT: br i1
+define amdgpu_kernel void @test_intrinsic_call_cost(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture %in) #0 { ; the only call in this loop is an llvm.* intrinsic; the FileCheck directives above require that no conditional branch is left in this function
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ] ; loop counter: 0 on entry, incremented value on the back edge
+  %sum.02 = phi float [ %fmul, %for.body ], [ 0.0, %entry ] ; loop-carried value: 0.0 on entry, then the previous iteration's %fmul
+  %arrayidx.in = getelementptr inbounds float, float addrspace(1)* %in, i32 %indvars.iv
+  %arrayidx.out = getelementptr inbounds float, float addrspace(1)* %out, i32 %indvars.iv
+  %load = load float, float addrspace(1)* %arrayidx.in
+  %call = call float @llvm.minnum.f32(float %load, float 1.0); intrinsic call under test
+  %fmul = fmul float %call, %sum.02
+  store float %fmul, float addrspace(1)* %arrayidx.out
+  %indvars.iv.next = add i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, 16 ; the body runs 16 times: the counter starts at 0 and the loop exits once the incremented value equals 16
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: @test_func_call_cost(
+; CHECK: br i1 %exitcond
+define amdgpu_kernel void @test_func_call_cost(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture %in) #0 { ; NOTE(review): despite the name, the call in this loop is indirect (through %fptr); the FileCheck directive above requires the conditional exit branch to still be present
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ] ; loop counter: 0 on entry, incremented value on the back edge
+  %sum.02 = phi float [ %fmul, %for.body ], [ 0.0, %entry ] ; loop-carried value: 0.0 on entry, then the previous iteration's %fmul
+  %arrayidx.in = getelementptr inbounds float, float addrspace(1)* %in, i32 %indvars.iv
+  %arrayidx.out = getelementptr inbounds float, float addrspace(1)* %out, i32 %indvars.iv
+  %load = load float, float addrspace(1)* %arrayidx.in
+  %fptr = load float(float, float)*, float(float, float )* addrspace(4)* null ; function pointer loaded from a null addrspace(4) address, so the call below names no callee
+  %call = tail call float %fptr(float %load, float 1.0) ; indirect tail call through the loaded pointer; this is the call under test
+  %fmul = fmul float %call, %sum.02
+  store float %fmul, float addrspace(1)* %arrayidx.out
+  %indvars.iv.next = add i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, 16 ; the body runs 16 times: the counter starts at 0 and the loop exits once the incremented value equals 16
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: @test_indirect_call_cost(
+; CHECK: br i1 %exitcond
+define amdgpu_kernel void @test_indirect_call_cost(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture %in) #0 { ; NOTE(review): despite the name, the call in this loop is a direct call to @func; the FileCheck directive above requires the conditional exit branch to still be present
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ] ; loop counter: 0 on entry, incremented value on the back edge
+  %sum.02 = phi float [ %fmul, %for.body ], [ 0.0, %entry ] ; loop-carried value: 0.0 on entry, then the previous iteration's %fmul
+  %arrayidx.in = getelementptr inbounds float, float addrspace(1)* %in, i32 %indvars.iv
+  %arrayidx.out = getelementptr inbounds float, float addrspace(1)* %out, i32 %indvars.iv
+  %load = load float, float addrspace(1)* %arrayidx.in
+  %min = call float @func(float %load, float 1.0); direct call to @func, which is only declared (no body) in this file; this is the call under test
+  %fmul = fmul float %min, %sum.02
+  store float %fmul, float addrspace(1)* %arrayidx.out
+  %indvars.iv.next = add i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, 16 ; the body runs 16 times: the counter starts at 0 and the loop exits once the incremented value equals 16
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare float @llvm.minnum.f32(float, float) #1 ; intrinsic called by @test_intrinsic_call_cost
+declare float @func(float, float) #1 ; non-intrinsic function with no body in this file; called directly by @test_indirect_call_cost
+
+attributes #0 = { nounwind } ; attribute group referenced by the kernel definitions in this file
+attributes #1 = { nounwind readnone speculatable } ; attribute group referenced by both declarations above
author	Matt Arsenault <Matthew.Arsenault@amd.com>	2018-06-26 18:51:17 +0000
committer	Matt Arsenault <Matthew.Arsenault@amd.com>	2018-06-26 18:51:17 +0000
commit	2c1a570aabc184e93f160992c3fe3eab8f884f74 (patch)
tree	4accff69a6dbb2c0038ffeab5109107289a83133 /llvm/test/Transforms/LoopUnroll/AMDGPU
parent	e745cf9bf31334cc89f0766269f904117f769b68 (diff)
download	bcm5719-llvm-2c1a570aabc184e93f160992c3fe3eab8f884f74.tar.gz bcm5719-llvm-2c1a570aabc184e93f160992c3fe3eab8f884f74.zip