diff options
| author | Alexander Timofeev <Alexander.Timofeev@amd.com> | 2016-11-03 14:37:13 +0000 |
|---|---|---|
| committer | Alexander Timofeev <Alexander.Timofeev@amd.com> | 2016-11-03 14:37:13 +0000 |
| commit | f867a40bf60ad813560fe4cc3d2cc100472ffef4 (patch) | |
| tree | e888ef6d503dc980fc536452f72a71ab5182b7af /llvm/test/CodeGen | |
| parent | 73aba6229f7f6cdc1aa5b107518684a95da4851e (diff) | |
| download | bcm5719-llvm-f867a40bf60ad813560fe4cc3d2cc100472ffef4.tar.gz bcm5719-llvm-f867a40bf60ad813560fe4cc3d2cc100472ffef4.zip | |
[AMDGPU][CodeGen] To improve CGEMM performance: combine LDS reads.
This change explores the fact that LDS reads may be reordered even if they
access the same location.
Prior to the change, the algorithm stopped immediately as soon as any memory
access was encountered between loads that were expected to be merged
together, even though a Read-After-Read conflict cannot affect execution
correctness.
Improves the performance of hcBLAS CGEMM manually loop-unrolled kernels by 44%.
An improvement is also expected for any massive sequence of reads from LDS.
Differential Revision: https://reviews.llvm.org/D25944
llvm-svn: 285919
Diffstat (limited to 'llvm/test/CodeGen')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/ds_read2.ll | 40 |
1 files changed, 40 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index 6e30cff9609..9a313230e30 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -493,6 +493,46 @@ define void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) ret void } +; SI-LABEL: ds_read_diff_base_interleaving +; SI-NOT: ds_read_b32 +define amdgpu_kernel void @ds_read_diff_base_interleaving( + float addrspace(1)* nocapture %arg, + [4 x [4 x float]] addrspace(3)* %arg1, + [4 x [4 x float]] addrspace(3)* %arg2, + [4 x [4 x float]] addrspace(3)* %arg3, + [4 x [4 x float]] addrspace(3)* %arg4) #1 { +bb: + %tmp = getelementptr float, float addrspace(1)* %arg, i64 10 + %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.x() #2 + %tmp6 = tail call i32 @llvm.amdgcn.workitem.id.y() #2 + %tmp7 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg1, i32 0, i32 %tmp6, i32 0 + %tmp8 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg2, i32 0, i32 0, i32 %tmp5 + %tmp9 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg3, i32 0, i32 %tmp6, i32 0 + %tmp10 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg4, i32 0, i32 0, i32 %tmp5 + %tmp11 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg1, i32 0, i32 %tmp6, i32 1 + %tmp12 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg2, i32 0, i32 1, i32 %tmp5 + %tmp13 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg3, i32 0, i32 %tmp6, i32 1 + %tmp14 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg4, i32 0, i32 1, i32 %tmp5 + %tmp15 = load float, float addrspace(3)* %tmp7 + %tmp16 = load float, float addrspace(3)* %tmp8 + %tmp17 = fmul float %tmp15, %tmp16 + %tmp18 = fadd float 2.000000e+00, %tmp17 + %tmp19 = load float, float addrspace(3)* %tmp9 + %tmp20 = load float, float addrspace(3)* %tmp10 + %tmp21 = fmul float %tmp19, 
%tmp20 + %tmp22 = fsub float %tmp18, %tmp21 + %tmp23 = load float, float addrspace(3)* %tmp11 + %tmp24 = load float, float addrspace(3)* %tmp12 + %tmp25 = fmul float %tmp23, %tmp24 + %tmp26 = fsub float %tmp22, %tmp25 + %tmp27 = load float, float addrspace(3)* %tmp13 + %tmp28 = load float, float addrspace(3)* %tmp14 + %tmp29 = fmul float %tmp27, %tmp28 + %tmp30 = fsub float %tmp26, %tmp29 + store float %tmp30, float addrspace(1)* %tmp + ret void +} + ; Function Attrs: nounwind readnone declare i32 @llvm.amdgcn.workgroup.id.x() #1 |

