diff options
author | Nicolai Haehnle <nhaehnle@gmail.com> | 2016-04-12 21:18:10 +0000 |
---|---|---|
committer | Nicolai Haehnle <nhaehnle@gmail.com> | 2016-04-12 21:18:10 +0000 |
commit | df77c9ada4c172def64f0a3514df6a34a223c409 (patch) | |
tree | e138e971963abf92c92ad8ea37836c317ed76e72 /llvm/test/CodeGen | |
parent | c86af3345c9e3d3814c73df644d58c11eba38922 (diff) | |
download | bcm5719-llvm-df77c9ada4c172def64f0a3514df6a34a223c409.tar.gz bcm5719-llvm-df77c9ada4c172def64f0a3514df6a34a223c409.zip |
AMDGPU: add llvm.amdgcn.buffer.load/store intrinsics
Summary:
They correspond to BUFFER_LOAD/STORE_DWORD[X2,X3,X4] and mostly behave like
llvm.amdgcn.buffer.load/store.format. They will be used by Mesa for SSBOs and
atomic counters, at least when robust buffer access behavior is desired.
(These instructions perform no format conversion and do buffer range checking
per component.)
As a side effect of sharing patterns with llvm.amdgcn.buffer.store.format,
it has become trivial to add support for the f32 and v2f32 variants of that
intrinsic, so the patch does so.
Also DAG-ify (and fix) some tests in which I noticed intermittent failures
while developing this patch.
Some tests were (temporarily) adjusted for the required mayLoad/hasSideEffects
changes to the BUFFER_STORE_DWORD* instructions. See also
http://reviews.llvm.org/D18291.
Reviewers: arsenm, tstellarAMD, mareko
Subscribers: arsenm, llvm-commits
Differential Revision: http://reviews.llvm.org/D18292
llvm-svn: 266126
Diffstat (limited to 'llvm/test/CodeGen')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/captured-frame-index.ll | 6 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 4 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll | 1 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll | 108 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll | 18 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll | 95 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll | 8 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/sminmax.ll | 24 |
8 files changed, 242 insertions, 22 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll index 7e2b98e7c5a..21c8af4fafa 100644 --- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll +++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll @@ -1,9 +1,9 @@ ; RUN: llc -march=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}stored_fi_to_lds: -; GCN: s_load_dword [[LDSPTR:s[0-9]+]] -; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO1]] +; GCN-DAG: s_load_dword [[LDSPTR:s[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ZERO1]] ; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]] ; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]] diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 85dae75645f..67798864a0c 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -33,8 +33,8 @@ define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> ; SI-NOT: bfe ; SI-NOT: v_cvt_f32_ubyte3_e32 ; SI-DAG: v_cvt_f32_ubyte2_e32 -; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]] -; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]] +; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], +; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll index 992ab5b389e..3c5b59c321a 100644 --- a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll +++ 
b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll @@ -32,7 +32,6 @@ define void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrs ; GCN-LABEL: {{^}}bitcast_int_to_fpvector_extract_0: ; GCN: buffer_load_dwordx2 ; GCN: v_add_i32 -; GCN: v_addc_u32 ; GCN: buffer_store_dword define void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) { %a = load i64, i64 addrspace(1)* %in diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll new file mode 100644 index 00000000000..3d5f69f0745 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll @@ -0,0 +1,108 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}buffer_load: +;CHECK: buffer_load_dwordx4 v[0:3], s[0:3], 0 +;CHECK: buffer_load_dwordx4 v[4:7], s[0:3], 0 glc +;CHECK: buffer_load_dwordx4 v[8:11], s[0:3], 0 slc +;CHECK: s_waitcnt +define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { +main_body: + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0) + %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0) + %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 1) + %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0 + %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1 + %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2 + ret {<4 x float>, <4 x float>, <4 x float>} %r2 +} + +;CHECK-LABEL: {{^}}buffer_load_immoffs: +;CHECK: buffer_load_dwordx4 v[0:3], s[0:3], 0 offset:42 +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { +main_body: + %data = call 
<4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_immoffs_large: +;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1fff +;CHECK: buffer_load_dwordx4 v[0:3], s[0:3], [[OFFSET]] offset:1 +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { +main_body: + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 8192, i1 0, i1 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_idx: +;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) { +main_body: + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_ofs: +;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { +main_body: + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_ofs_imm: +;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:58 +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { +main_body: + %ofs = add i32 %1, 58 + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_both: +;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) { +main_body: + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_both_reversed: +;CHECK: v_mov_b32_e32 v2, v0 +;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 
idxen offen +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) { +main_body: + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_x1: +;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen +;CHECK: s_waitcnt +define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +main_body: + %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0) + ret float %data +} + +;CHECK-LABEL: {{^}}buffer_load_x2: +;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen +;CHECK: s_waitcnt +define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +main_body: + %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0) + ret <2 x float> %data +} + +declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0 +declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0 + +attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll index 0af9a3e9f13..63dcd841383 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll @@ -70,6 +70,24 @@ main_body: ret void } +;CHECK-LABEL: {{^}}buffer_store_x1: +;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) { +main_body: + call void @llvm.amdgcn.buffer.store.format.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x2: +;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 
idxen +define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) { +main_body: + call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + ret void +} + +declare void @llvm.amdgcn.buffer.store.format.f32(float, <4 x i32>, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0 declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0 declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll new file mode 100644 index 00000000000..c935f9b520a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll @@ -0,0 +1,95 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}buffer_store: +;CHECK: buffer_store_dwordx4 v[0:3], s[0:3], 0 +;CHECK: buffer_store_dwordx4 v[4:7], s[0:3], 0 glc +;CHECK: buffer_store_dwordx4 v[8:11], s[0:3], 0 slc +define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +main_body: + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0) + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i1 0, i1 1) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_immoffs: +;CHECK: buffer_store_dwordx4 v[0:3], s[0:3], 0 offset:42 +define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +main_body: + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_idx: 
+;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) { +main_body: + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_ofs: +;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { +main_body: + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_both: +;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen +define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) { +main_body: + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_both_reversed: +;CHECK: v_mov_b32_e32 v6, v4 +;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen +define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) { +main_body: + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0) + ret void +} + +; Ideally, the register allocator would avoid the wait here +; +;CHECK-LABEL: {{^}}buffer_store_wait: +;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen +;CHECK: s_waitcnt vmcnt(0) expcnt(0) +;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { +main_body: + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0) + call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i1 0, 
i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x1: +;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) { +main_body: + call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x2: +;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 { +main_body: + call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) + ret void +} + +declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll index 34b85cdf632..6e2bd96064e 100644 --- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -11,8 +11,8 @@ declare void @llvm.amdgcn.s.barrier() #1 ; FUNC-LABEL: @reorder_local_load_global_store_local_load ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 +; CI-NEXT: buffer_store_dword ; CI-NEXT: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8 -; CI: buffer_store_dword define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 @@ -71,9 +71,9 @@ define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace } ; FUNC-LABEL: 
@reorder_constant_load_global_store_constant_load -; CI: buffer_store_dword ; CI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}} ; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}} +; CI-DAG: buffer_store_dword ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x2 ; CI: buffer_store_dword @@ -184,11 +184,11 @@ define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspa } ; FUNC-LABEL: @reorder_global_offsets +; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 ; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 ; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404 -; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 -; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 ; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400 +; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12 ; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404 ; CI: buffer_store_dword ; CI: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.ll b/llvm/test/CodeGen/AMDGPU/sminmax.ll index e646605f7da..bea3ac15f55 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.ll @@ -46,11 +46,11 @@ define void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind } ; FUNC-LABEL: {{^}}v_abs_v2i32: -; GCN: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]] -; GCN: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]] +; GCN-DAG: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]] +; GCN-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]] -; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]] -; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]] +; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]] +; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, 
[[NEG1]], [[SRC1]] ; GCN: v_add_i32 ; GCN: v_add_i32 @@ -97,15 +97,15 @@ define void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind } ; FUNC-LABEL: {{^}}v_abs_v4i32: -; GCN: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]] -; GCN: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]] -; GCN: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]] -; GCN: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]] +; GCN-DAG: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]] +; GCN-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]] +; GCN-DAG: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]] +; GCN-DAG: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]] -; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]] -; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]] -; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]] -; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]] +; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]] +; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]] +; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]] +; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]] ; GCN: v_add_i32 ; GCN: v_add_i32 |