summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Jay Foad <jay.foad@gmail.com> 2019-09-06 10:07:28 +0000
committer: Jay Foad <jay.foad@gmail.com> 2019-09-06 10:07:28 +0000
commit: 6c0204c794ba143ce6817f695e03fd9f49cd4ed8 (patch)
tree: 1ce58102bd39e3c4b6e0962fa1f4321975755608
parent: 68df9dc09818289ac5b40d0916bb479a358a8e18 (diff)
download: bcm5719-llvm-6c0204c794ba143ce6817f695e03fd9f49cd4ed8.tar.gz
          bcm5719-llvm-6c0204c794ba143ce6817f695e03fd9f49cd4ed8.zip
[AMDGPU] Mark s_barrier as having side effects but not accessing memory.
Summary:
This fixes poor scheduling in a function containing a barrier and a few
load instructions.

Without this fix, ScheduleDAGInstrs::buildSchedGraph adds an artificial
edge in the dependency graph from the barrier instruction to the exit
node representing live-out latency, with a latency of about 500 cycles.
Because of this it thinks the critical path through the graph also has
a latency of about 500 cycles. And because of that it does not think
that any of the load instructions are on the critical path, so it
schedules them with no regard for their (80 cycle) latency, which gives
poor results.

Reviewers: arsenm, dstuttard, tpr, nhaehnle

Subscribers: kzhuravl, jvesely, wdng, yaxunl, t-tye, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D67218

llvm-svn: 371192
-rw-r--r--  llvm/include/llvm/IR/IntrinsicsAMDGPU.td            2
-rw-r--r--  llvm/lib/Target/AMDGPU/SOPInstructions.td           2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll  56
-rw-r--r--  llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll     8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/schedule-barrier.mir      46
-rw-r--r--  llvm/test/CodeGen/AMDGPU/v_mac_f16.ll              17
-rw-r--r--  llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll          50
7 files changed, 113 insertions, 68 deletions
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 14605a433e5..29a3d2460eb 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -206,7 +206,7 @@ def int_amdgcn_s_sendmsghalt : GCCBuiltin<"__builtin_amdgcn_s_sendmsghalt">,
[ImmArg<0>, IntrNoMem, IntrHasSideEffects]>;
def int_amdgcn_s_barrier : GCCBuiltin<"__builtin_amdgcn_s_barrier">,
- Intrinsic<[], [], [IntrConvergent]>;
+ Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent]>;
def int_amdgcn_wave_barrier : GCCBuiltin<"__builtin_amdgcn_wave_barrier">,
Intrinsic<[], [], [IntrConvergent]>;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index d2d783ba76c..34082eb0cd4 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1076,8 +1076,6 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
[(int_amdgcn_s_barrier)]> {
let SchedRW = [WriteBarrier];
let simm16 = 0;
- let mayLoad = 1;
- let mayStore = 1;
let isConvergent = 1;
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index 2dd7e20c00c..0bf3125b22b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -16,9 +16,9 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
; VARIANT0-NEXT: v_mov_b32_e32 v2, 0
; VARIANT0-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
-; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s2, v3
; VARIANT0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; VARIANT0-NEXT: s_barrier
+; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s2, v3
; VARIANT0-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; VARIANT0-NEXT: v_lshl_b64 v[3:4], v[3:4], 2
; VARIANT0-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
@@ -37,8 +37,8 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
; VARIANT1-NEXT: v_mov_b32_e32 v2, 0
; VARIANT1-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
-; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s2, v3
; VARIANT1-NEXT: s_barrier
+; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s2, v3
; VARIANT1-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; VARIANT1-NEXT: v_lshl_b64 v[3:4], v[3:4], 2
; VARIANT1-NEXT: s_waitcnt expcnt(0)
@@ -50,46 +50,46 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
; VARIANT2-LABEL: test_barrier:
; VARIANT2: ; %bb.0: ; %entry
; VARIANT2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VARIANT2-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VARIANT2-NEXT: s_load_dword s0, s[0:1], 0x2c
+; VARIANT2-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; VARIANT2-NEXT: s_waitcnt lgkmcnt(0)
-; VARIANT2-NEXT: v_mov_b32_e32 v2, s3
-; VARIANT2-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1
-; VARIANT2-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
-; VARIANT2-NEXT: global_store_dword v[1:2], v0, off
+; VARIANT2-NEXT: v_mov_b32_e32 v4, s3
+; VARIANT2-NEXT: v_xad_u32 v1, v0, -1, s0
+; VARIANT2-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; VARIANT2-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3
+; VARIANT2-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2]
+; VARIANT2-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; VARIANT2-NEXT: global_store_dword v[3:4], v0, off
+; VARIANT2-NEXT: v_mov_b32_e32 v5, s3
+; VARIANT2-NEXT: v_add_co_u32_e32 v0, vcc, s2, v1
+; VARIANT2-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
; VARIANT2-NEXT: s_waitcnt vmcnt(0)
; VARIANT2-NEXT: s_barrier
-; VARIANT2-NEXT: v_xad_u32 v3, v0, -1, s0
-; VARIANT2-NEXT: v_ashrrev_i32_e32 v4, 31, v3
-; VARIANT2-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4]
-; VARIANT2-NEXT: v_mov_b32_e32 v0, s3
-; VARIANT2-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3
-; VARIANT2-NEXT: v_addc_co_u32_e32 v4, vcc, v0, v4, vcc
-; VARIANT2-NEXT: global_load_dword v0, v[3:4], off
+; VARIANT2-NEXT: global_load_dword v0, v[0:1], off
; VARIANT2-NEXT: s_waitcnt vmcnt(0)
-; VARIANT2-NEXT: global_store_dword v[1:2], v0, off
+; VARIANT2-NEXT: global_store_dword v[3:4], v0, off
; VARIANT2-NEXT: s_endpgm
;
; VARIANT3-LABEL: test_barrier:
; VARIANT3: ; %bb.0: ; %entry
; VARIANT3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VARIANT3-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VARIANT3-NEXT: s_load_dword s0, s[0:1], 0x2c
+; VARIANT3-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; VARIANT3-NEXT: s_waitcnt lgkmcnt(0)
-; VARIANT3-NEXT: v_mov_b32_e32 v2, s3
-; VARIANT3-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1
-; VARIANT3-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
-; VARIANT3-NEXT: global_store_dword v[1:2], v0, off
-; VARIANT3-NEXT: s_barrier
-; VARIANT3-NEXT: v_xad_u32 v3, v0, -1, s0
-; VARIANT3-NEXT: v_ashrrev_i32_e32 v4, 31, v3
-; VARIANT3-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4]
-; VARIANT3-NEXT: v_mov_b32_e32 v0, s3
+; VARIANT3-NEXT: v_mov_b32_e32 v4, s3
+; VARIANT3-NEXT: v_xad_u32 v1, v0, -1, s0
+; VARIANT3-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; VARIANT3-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3
-; VARIANT3-NEXT: v_addc_co_u32_e32 v4, vcc, v0, v4, vcc
-; VARIANT3-NEXT: global_load_dword v0, v[3:4], off
+; VARIANT3-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2]
+; VARIANT3-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; VARIANT3-NEXT: global_store_dword v[3:4], v0, off
+; VARIANT3-NEXT: v_mov_b32_e32 v5, s3
+; VARIANT3-NEXT: v_add_co_u32_e32 v0, vcc, s2, v1
+; VARIANT3-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
+; VARIANT3-NEXT: s_barrier
+; VARIANT3-NEXT: global_load_dword v0, v[0:1], off
; VARIANT3-NEXT: s_waitcnt vmcnt(0)
-; VARIANT3-NEXT: global_store_dword v[1:2], v0, off
+; VARIANT3-NEXT: global_store_dword v[3:4], v0, off
; VARIANT3-NEXT: s_endpgm
entry:
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
index a070488a4bc..f6611ec26a8 100644
--- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -38,18 +38,16 @@ entry:
; GCN-LABEL: {{^}}local_memory_two_objects:
; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
+; CI-DAG: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
-
-; GCN: s_barrier
-
; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
+; GCN: s_barrier
+
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
-
-; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir
new file mode 100644
index 00000000000..a72a406ff09
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir
@@ -0,0 +1,46 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=machine-scheduler -o - %s | FileCheck %s
+
+---
+# Check that the high latency loads are both scheduled first, before the
+# multiplies, despite the presence of a barrier in the function.
+# CHECK: BUFFER_LOAD_DWORD_OFFSET
+# CHECK: BUFFER_LOAD_DWORD_OFFSET
+# CHECK: V_MUL_LO_U32
+# CHECK: V_MUL_LO_U32
+name: test
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
+
+ undef %43.sub3:vreg_128 = COPY $vgpr9
+ undef %42.sub2:vreg_128 = COPY $vgpr8
+ undef %41.sub1:vreg_128 = COPY $vgpr7
+ undef %26.sub0:vreg_128 = COPY $vgpr6
+ undef %46.sub3:vreg_128 = COPY $vgpr5
+ undef %45.sub2:vreg_128 = COPY $vgpr4
+ undef %44.sub1:vreg_128 = COPY $vgpr3
+ undef %32.sub0:vreg_128 = COPY $vgpr2
+ undef %38.sub1:vreg_64 = COPY $vgpr1
+ %38.sub0:vreg_64 = COPY $vgpr0
+
+ S_BARRIER
+
+ undef %33.sub0:sgpr_128 = V_READFIRSTLANE_B32 %32.sub0, implicit $exec
+ %33.sub1:sgpr_128 = V_READFIRSTLANE_B32 %44.sub1, implicit $exec
+ %33.sub2:sgpr_128 = V_READFIRSTLANE_B32 %45.sub2, implicit $exec
+ %33.sub3:sgpr_128 = V_READFIRSTLANE_B32 %46.sub3, implicit $exec
+ %15:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %33, 0, 0, 0, 0, 0, 0, implicit $exec
+ %39:vgpr_32 = V_MUL_LO_U32 %15, %15, implicit $exec
+
+ undef %27.sub0:sgpr_128 = V_READFIRSTLANE_B32 %26.sub0, implicit $exec
+ %27.sub1:sgpr_128 = V_READFIRSTLANE_B32 %41.sub1, implicit $exec
+ %27.sub2:sgpr_128 = V_READFIRSTLANE_B32 %42.sub2, implicit $exec
+ %27.sub3:sgpr_128 = V_READFIRSTLANE_B32 %43.sub3, implicit $exec
+ %19:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %27, 0, 0, 0, 0, 0, 0, implicit $exec
+ %40:vgpr_32 = V_MUL_LO_U32 %19, %19, implicit $exec
+
+ %23:vgpr_32 = V_ADD_U32_e32 %39, %40, implicit $exec
+ GLOBAL_STORE_DWORD %38, %23, 0, 0, 0, 0, implicit $exec
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
index f2c2cc847c7..e503fbb51ad 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
@@ -304,14 +304,17 @@ entry:
; GCN: {{buffer|flat}}_load_dword v[[C_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
+; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+
+; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+
+; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
+
; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]]
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
index c8777f441f2..4cbd8914772 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
@@ -5,8 +5,8 @@
; GCN-LABEL: barrier_vmcnt_global:
; GFX8: flat_load_dword
; GFX9_10: global_load_dword
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX9_10-NEXT: s_waitcnt vmcnt(0){{$}}
+; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX9_10: s_waitcnt vmcnt(0){{$}}
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_global(i32 addrspace(1)* %arg) {
bb:
@@ -28,9 +28,9 @@ bb:
; GCN-LABEL: barrier_vscnt_global:
; GFX8: flat_store_dword
; GFX9_10: global_store_dword
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX9-NEXT: s_waitcnt vmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX9: s_waitcnt vmcnt(0){{$}}
+; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vscnt_global(i32 addrspace(1)* %arg) {
bb:
@@ -54,9 +54,9 @@ bb:
; GCN-LABEL: barrier_vmcnt_vscnt_global:
; GFX8: flat_load_dword
; GFX9_10: global_load_dword
-; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX9_10-NEXT: s_waitcnt vmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX9_10: s_waitcnt vmcnt(0){{$}}
+; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_global(i32 addrspace(1)* %arg) {
bb:
@@ -81,7 +81,7 @@ bb:
; GCN-LABEL: barrier_vmcnt_flat:
; GCN: flat_load_dword
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_flat(i32* %arg) {
bb:
@@ -102,9 +102,9 @@ bb:
; GCN-LABEL: barrier_vscnt_flat:
; GCN: flat_store_dword
-; GFX8_9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX8_9: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10: s_waitcnt lgkmcnt(0){{$}}
+; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vscnt_flat(i32* %arg) {
bb:
@@ -127,8 +127,8 @@ bb:
; GCN-LABEL: barrier_vmcnt_vscnt_flat:
; GCN: flat_load_dword
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(i32* %arg) {
bb:
@@ -153,8 +153,8 @@ bb:
; GCN-LABEL: barrier_vmcnt_vscnt_flat_workgroup:
; GCN: flat_load_dword
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(i32* %arg) {
bb:
@@ -218,8 +218,8 @@ bb:
; GCN-LABEL: store_vscnt_private:
; GCN: buffer_store_dword
-; GFX8_9-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX8_9: s_waitcnt vmcnt(0)
+; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @store_vscnt_private(i32 addrspace(5)* %p) {
store i32 0, i32 addrspace(5)* %p
@@ -229,8 +229,8 @@ define void @store_vscnt_private(i32 addrspace(5)* %p) {
; GCN-LABEL: store_vscnt_global:
; GFX8: flat_store_dword
; GFX9_10: global_store_dword
-; GFX8_9-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX8_9: s_waitcnt vmcnt(0)
+; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @store_vscnt_global(i32 addrspace(1)* %p) {
store i32 0, i32 addrspace(1)* %p
@@ -239,9 +239,9 @@ define void @store_vscnt_global(i32 addrspace(1)* %p) {
; GCN-LABEL: store_vscnt_flat:
; GCN: flat_store_dword
-; GFX8_9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX8_9: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX10: s_waitcnt lgkmcnt(0){{$}}
+; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @store_vscnt_flat(i32* %p) {
store i32 0, i32* %p
@@ -249,8 +249,8 @@ define void @store_vscnt_flat(i32* %p) {
}
; GCN-LABEL: function_prologue:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0){{$}}
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0){{$}}
+; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @function_prologue() {
ret void
OpenPOWER on IntegriCloud