From 3003ba00a3260bdee71dd802bcfa970c3580e6bb Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle <nhaehnle@gmail.com>
Date: Fri, 18 Mar 2016 16:24:20 +0000
Subject: AMDGPU: use ComplexPattern for offsets in
 llvm.amdgcn.buffer.load/store.format

Summary:
We cannot easily deduce that an offset is in an SGPR, but the Mesa frontend
cannot easily make use of an explicit soffset parameter either. Furthermore,
it is likely that in the future, LLVM will be in a better position than the
frontend to choose an SGPR offset if possible.

Since there aren't any frontend uses of these intrinsics in upstream
repositories yet, I would like to take this opportunity to change the
intrinsic signatures to a single offset parameter, which is then selected
to immediate offsets or voffsets using a ComplexPattern.

Reviewers: arsenm, tstellarAMD, mareko

Subscribers: arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D18218

llvm-svn: 263790
---
 .../AMDGPU/llvm.amdgcn.buffer.load.format.ll       | 71 +++++++++++++++++-----
 .../AMDGPU/llvm.amdgcn.buffer.store.format.ll      | 48 +++++++--------
 2 files changed, 80 insertions(+), 39 deletions(-)

(limited to 'llvm/test')

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll
index d8ee315cfb8..c6222f426b3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll
@@ -2,15 +2,15 @@
 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ;CHECK-LABEL: {{^}}buffer_load:
-;CHECK: buffer_load_format_xyzw v[0:3], s[0:3], s4
-;CHECK: buffer_load_format_xyzw v[4:7], s[0:3], s4 glc
-;CHECK: buffer_load_format_xyzw v[8:11], s[0:3], s4 slc
+;CHECK: buffer_load_format_xyzw v[0:3], s[0:3], 0
+;CHECK: buffer_load_format_xyzw v[4:7], s[0:3], 0 glc
+;CHECK: buffer_load_format_xyzw v[8:11], s[0:3], 0 slc
 ;CHECK: s_waitcnt
-define {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg, i32 inreg) #0 {
+define {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) #0 {
 main_body:
-  %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 %1, i32 0, i32 0, i32 0, i1 0, i1 0)
-  %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 %1, i32 0, i32 0, i32 0, i1 1, i1 0)
-  %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 %1, i32 0, i32 0, i32 0, i1 0, i1 1)
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
+  %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
+  %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
   %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
   %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
   %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
@@ -18,11 +18,42 @@ main_body:
 }
 
 ;CHECK-LABEL: {{^}}buffer_load_immoffs:
-;CHECK: buffer_load_format_xyzw v[0:3], s[0:3], s4 offset:42
+;CHECK: buffer_load_format_xyzw v[0:3], s[0:3], 0 offset:42
 ;CHECK: s_waitcnt
-define <4 x float> @buffer_load_immoffs(<4 x i32> inreg, i32 inreg) #0 {
+define <4 x float> @buffer_load_immoffs(<4 x i32> inreg) #0 {
 main_body:
-  %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 %1, i32 42, i32 0, i32 0, i1 0, i1 0)
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
+;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 61 offset:4095
+;CHECK-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7fff
+;CHECK: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, s[0:3], [[OFS1]] offset:4093
+;CHECK: s_mov_b32 [[OFS2:s[0-9]+]], 0x8fff
+;CHECK: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, s[0:3], [[OFS2]] offset:1
+;CHECK: s_waitcnt
+define <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) #0 { ; constant offsets above 4095: each is expected to be split into an soffset operand plus an "offset:" immediate (presumably 4095 is the largest encodable immediate - confirm against the ISA docs)
+main_body:
+  %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 4156, i1 0, i1 0) ; 4156 = 61 + 4095: expected as inline soffset 61 with offset:4095
+  %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 36860, i1 0, i1 0) ; 36860 = 0x7fff + 4093: soffset expected in an SGPR loaded by s_movk_i32
+  %d.2 = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 36864, i1 0, i1 0) ; 36864 = 0x8fff + 1: soffset expected in an SGPR loaded by s_mov_b32
+  %d.3 = fadd <4 x float> %d.0, %d.1 ; combine the first two results
+  %data = fadd <4 x float> %d.2, %d.3 ; fold in the third so the return value depends on all three loads
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_immoffs_reuse:
+;CHECK: s_movk_i32 [[OFS:s[0-9]+]], 0xfff
+;CHECK: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, s[0:3], [[OFS]] offset:65
+;CHECK-NOT: s_mov
+;CHECK: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, s[0:3], [[OFS]] offset:81
+;CHECK: s_waitcnt
+define <4 x float> @buffer_load_immoffs_reuse(<4 x i32> inreg) #0 { ; two nearby large constant offsets: both loads are expected to use the same SGPR base (0xfff) with no second scalar move in between
+main_body:
+  %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 4160, i1 0, i1 0) ; 4160 = 0xfff + 65
+  %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 4176, i1 0, i1 0) ; 4176 = 0xfff + 81 (same base, different immediate)
+  %data = fadd <4 x float> %d.0, %d.1 ; return value depends on both loads
   ret <4 x float> %data
 }
 
@@ -31,7 +62,7 @@ main_body:
 ;CHECK: s_waitcnt
 define <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) #0 {
 main_body:
-  %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 0, i32 %1, i32 0, i1 0, i1 0)
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0)
   ret <4 x float> %data
 }
 
@@ -40,7 +71,17 @@ main_body:
 ;CHECK: s_waitcnt
 define <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) #0 {
 main_body:
-  %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 0, i32 0, i32 %1, i1 0, i1 0)
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
+;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:58
+;CHECK: s_waitcnt
+define <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) #0 { ; offset given as (register value + constant): expected to select offen on v0 plus an offset:58 immediate
+main_body:
+  %ofs = add i32 %1, 58 ; the constant addend 58 is what the expected output shows as offset:58
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0) ; index operand is 0, so the expected instruction carries no idxen
   ret <4 x float> %data
 }
 
@@ -49,7 +90,7 @@ main_body:
 ;CHECK: s_waitcnt
 define <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) #0 {
 main_body:
-  %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 0, i32 %1, i32 %2, i1 0, i1 0)
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0)
   ret <4 x float> %data
 }
 
@@ -59,11 +100,11 @@ main_body:
 ;CHECK: s_waitcnt
 define <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) #0 {
 main_body:
-  %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 0, i32 %2, i32 %1, i1 0, i1 0)
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0)
   ret <4 x float> %data
 }
 
-declare <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32>, i32, i32, i32, i32, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32>, i32, i32, i1, i1) #1
 
 attributes #0 = { "ShaderType"="0" }
 attributes #1 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
index 87e6e6da32c..7e254efdcca 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
@@ -2,55 +2,55 @@
 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ;CHECK-LABEL: {{^}}buffer_store:
-;CHECK: buffer_store_format_xyzw v[0:3], s[0:3], s4
-;CHECK: buffer_store_format_xyzw v[4:7], s[0:3], s4 glc
-;CHECK: buffer_store_format_xyzw v[8:11], s[0:3], s4 slc
-define void @buffer_store(<4 x i32> inreg, i32 inreg, <4 x float>, <4 x float>, <4 x float>) #0 {
+;CHECK: buffer_store_format_xyzw v[0:3], s[0:3], 0
+;CHECK: buffer_store_format_xyzw v[4:7], s[0:3], 0 glc
+;CHECK: buffer_store_format_xyzw v[8:11], s[0:3], 0 slc
+define void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) #0 {
 main_body:
-  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 %1, i32 0, i32 0, i32 0, i1 0, i1 0)
-  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %3, <4 x i32> %0, i32 %1, i32 0, i32 0, i32 0, i1 1, i1 0)
-  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %4, <4 x i32> %0, i32 %1, i32 0, i32 0, i32 0, i1 0, i1 1)
+  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
+  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
   ret void
 }
 
 ;CHECK-LABEL: {{^}}buffer_store_immoffs:
-;CHECK: buffer_store_format_xyzw v[0:3], s[0:3], s4 offset:42
-define void @buffer_store_immoffs(<4 x i32> inreg, i32 inreg, <4 x float>) #0 {
+;CHECK: buffer_store_format_xyzw v[0:3], s[0:3], 0 offset:42
+define void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) #0 {
 main_body:
-  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 %1, i32 42, i32 0, i32 0, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
   ret void
 }
 
 ;CHECK-LABEL: {{^}}buffer_store_idx:
 ;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen
-define void @buffer_store_idx(<4 x i32> inreg, i32 inreg, <4 x float>, i32) #0 {
+define void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) #0 {
 main_body:
-  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 %3, i32 0, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
   ret void
 }
 
 ;CHECK-LABEL: {{^}}buffer_store_ofs:
 ;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 offen
-define void @buffer_store_ofs(<4 x i32> inreg, i32 inreg, <4 x float>, i32) #0 {
+define void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) #0 {
 main_body:
-  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 %3, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0)
   ret void
 }
 
 ;CHECK-LABEL: {{^}}buffer_store_both:
 ;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen
-define void @buffer_store_both(<4 x i32> inreg, i32 inreg, <4 x float>, i32, i32) #0 {
+define void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) #0 {
 main_body:
-  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 %3, i32 %4, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0)
   ret void
 }
 
 ;CHECK-LABEL: {{^}}buffer_store_both_reversed:
 ;CHECK: v_mov_b32_e32 v6, v4
 ;CHECK: buffer_store_format_xyzw v[0:3], v[5:6], s[0:3], 0 idxen offen
-define void @buffer_store_both_reversed(<4 x i32> inreg, i32 inreg, <4 x float>, i32, i32) #0 {
+define void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) #0 {
 main_body:
-  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 %4, i32 %3, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0)
   ret void
 }
 
@@ -62,16 +62,16 @@ main_body:
 ;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen
 ;CHECK: s_waitcnt vmcnt(0)
 ;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen
-define void @buffer_store_wait(<4 x i32> inreg, i32 inreg, <4 x float>, i32, i32, i32) #0 {
+define void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) #0 {
 main_body:
-  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 %3, i32 0, i1 0, i1 0)
-  %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 0, i32 0, i32 %4, i32 0, i1 0, i1 0)
-  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %data, <4 x i32> %0, i32 0, i32 0, i32 %5, i32 0, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i1 0, i1 0)
   ret void
 }
 
-declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i1, i1) #1
-declare <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32>, i32, i32, i32, i32, i1, i1) #2
+declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.buffer.load.format(<4 x i32>, i32, i32, i1, i1) #2
 
 attributes #0 = { "ShaderType"="0" }
 attributes #1 = { nounwind }
-- 
cgit v1.2.3