diff options
Diffstat (limited to 'llvm/test')
19 files changed, 384 insertions, 111 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/call-return-types.ll b/llvm/test/CodeGen/AMDGPU/call-return-types.ll index af87c0deb55..8751c61dcd4 100644 --- a/llvm/test/CodeGen/AMDGPU/call-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-return-types.ll @@ -174,10 +174,8 @@ define amdgpu_kernel void @test_call_external_v2i32_func_void() #0 { ; GCN-LABEL: {{^}}test_call_external_v3i32_func_void: ; GCN: s_swappc -; GFX7-DAG: flat_store_dwordx2 {{.*}}, v[0:1] -; GFX7-DAG: flat_store_dword {{.*}}, v2 -; GFX89-DAG: buffer_store_dwordx2 v[0:1] -; GFX89-DAG: buffer_store_dword v2 +; GFX7-DAG: flat_store_dwordx3 {{.*}}, v[0:2] +; GFX89-DAG: buffer_store_dwordx3 v[0:2] define amdgpu_kernel void @test_call_external_v3i32_func_void() #0 { %val = call <3 x i32> @external_v3i32_func_void() store volatile <3 x i32> %val, <3 x i32> addrspace(1)* undef, align 8 @@ -254,10 +252,8 @@ define amdgpu_kernel void @test_call_external_v4f16_func_void() #0 { ; GCN-LABEL: {{^}}test_call_external_v3f32_func_void: ; GCN: s_swappc -; GFX7-DAG: flat_store_dwordx2 {{.*}}, v[0:1] -; GFX7-DAG: flat_store_dword {{.*}}, v2 -; GFX89-DAG: buffer_store_dwordx2 v[0:1] -; GFX89-DAG: buffer_store_dword v2 +; GFX7-DAG: flat_store_dwordx3 {{.*}}, v[0:2] +; GFX89-DAG: buffer_store_dwordx3 v[0:2] define amdgpu_kernel void @test_call_external_v3f32_func_void() #0 { %val = call <3 x float> @external_v3f32_func_void() store volatile <3 x float> %val, <3 x float> addrspace(1)* undef diff --git a/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll b/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll index 626a6e2c5b8..cd3aeb48faa 100644 --- a/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll +++ b/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll @@ -1,4 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCNX3 %s ; FIXME: Most of these cases that don't trigger because of broken cost ; heuristics. Should not need -stress-early-ifcvt @@ -60,8 +61,9 @@ endif: ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc -; GCN-DAG: buffer_store_dword v -; GCN-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dword v +; GCNX3: buffer_store_dwordx3 define amdgpu_kernel void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 { entry: %v = load <3 x i32>, <3 x i32> addrspace(1)* %in diff --git a/llvm/test/CodeGen/AMDGPU/early-if-convert.ll b/llvm/test/CodeGen/AMDGPU/early-if-convert.ll index 6061f53e959..11826d124df 100644 --- a/llvm/test/CodeGen/AMDGPU/early-if-convert.ll +++ b/llvm/test/CodeGen/AMDGPU/early-if-convert.ll @@ -316,10 +316,10 @@ endif: ; GCN: s_add_i32 ; GCN: s_add_i32 ; GCN: s_add_i32 -; GCN: s_add_i32 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 -; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} -; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +; GCN-NEXT: s_cselect_b32 s +; GCN-NEXT: s_cselect_b32 s +; GCN-NEXT: s_cselect_b32 s define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(4)* %in, i32 %cond) #0 { entry: %v = load <3 x i32>, <3 x i32> addrspace(4)* %in diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index e4a18bf6fa1..442cdc92c30 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -161,8 +161,7 @@ define void @void_func_v2i32(<2 x i32> %arg0) #0 { } ; GCN-LABEL: {{^}}void_func_v3i32: -; GCN-DAG: buffer_store_dword v2, off -; GCN-DAG: buffer_store_dwordx2 v[0:1], off +; GCN-DAG: buffer_store_dwordx3 v[0:2], off define void @void_func_v3i32(<3 x i32> %arg0) #0 { store <3 x i32> %arg0, <3 x i32> addrspace(1)* undef ret void @@ -356,8 +355,7 @@ define void @void_func_v2f32(<2 x float> %arg0) #0 { } ; GCN-LABEL: {{^}}void_func_v3f32: -; GCN-DAG: buffer_store_dword v2, off -; GCN-DAG: buffer_store_dwordx2 v[0:1], off +; GCN-DAG: buffer_store_dwordx3 v[0:2], off define void @void_func_v3f32(<3 x float> %arg0) #0 { store <3 x float> %arg0, <3 x float> addrspace(1)* undef ret void diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll index f0927dec271..f9631e615c9 100644 --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -140,7 +140,7 @@ define <2 x i32> @v2i32_func_void() #0 { } ; GCN-LABEL: {{^}}v3i32_func_void: -; GCN: buffer_load_dwordx4 v[0:3], off +; GCN: buffer_load_dwordx3 v[0:2], off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <3 x i32> @v3i32_func_void() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 49b4d71bcf2..1908015f477 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -78,14 +78,13 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* ; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg: ; GCN: s_load_dwordx2 s -; GCN: s_load_dwordx2 s -; GCN-NOT: _load ; GCN: v_cvt_f32_f16_e32 ; GCN: v_cvt_f32_f16_e32 +; GCN: s_load_dwordx2 s +; GCN-NOT: _load ; GCN: v_cvt_f32_f16_e32 ; GCN-NOT: v_cvt_f32_f16 -; GCN-DAG: _store_dword -; GCN-DAG: _store_dwordx2 +; GCN-DAG: _store_dwordx3 ; GCN: s_endpgm define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 { %ext = fpext <3 x half> %arg to <3 x float> @@ -472,7 +471,7 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace } ; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16: -; GCN: flat_load_dwordx4 +; GCN: flat_load_dwordx3 ; GCN-DAG: v_cvt_f16_f32_e32 ; SI-DAG: v_cvt_f16_f32_e32 ; VI-DAG: v_cvt_f16_f32_sdwa diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index bac1a9e3f41..3dea940ec3f 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -1778,29 +1778,29 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s8, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshr_b32 s2, s0, 24 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 -; GFX8-NEXT: s_lshr_b32 s4, s2, 24 -; GFX8-NEXT: s_and_b32 s6, s1, s0 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX8-NEXT: s_and_b32 s0, s2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX8-NEXT: s_and_b32 s7, s1, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_mul_u32_u24_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80010 -; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-NEXT: v_mov_b32_e32 v7, s3 -; GFX8-NEXT: v_mul_u32_u24_e32 v4, s6, v4 -; GFX8-NEXT: v_mul_u32_u24_e32 v5, s5, v5 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80010 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_and_b32 s5, s0, s8 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mul_u32_u24_e32 v4, s5, v4 +; GFX8-NEXT: v_mul_u32_u24_e32 v5, s4, v5 ; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD diff --git a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll index 77557a58409..88a9405f0ac 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC -check-prefix=SI-NOHSA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC -check-prefix=GCNX3-HSA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC -check-prefix=GCNX3-NOHSA %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s @@ -30,8 +30,9 @@ entry: } ; FUNC-LABEL: {{^}}global_load_v3f32: -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-HSA: flat_load_dwordx4 +; SI-NOHSA: buffer_load_dwordx4 +; GCNX3-NOHSA: buffer_load_dwordx3 +; GCNX3-HSA: flat_load_dwordx3 ; R600: VTX_READ_128 define amdgpu_kernel void @global_load_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index 27081b8379b..fcbf8a1fc4a 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -1,8 +1,8 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI-NOHSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=GCNX3-HSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=GCNX3-NOHSA -check-prefix=FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=GCNX3-HSA -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}global_load_i32: @@ -30,8 +30,9 @@ entry: } ; FUNC-LABEL: {{^}}global_load_v3i32: -; GCN-NOHSA: buffer_load_dwordx4 -; GCN-HSA: {{flat|global}}_load_dwordx4 +; SI-NOHSA: buffer_load_dwordx4 +; GCNX3-NOHSA: buffer_load_dwordx3 +; GCNX3-HSA: {{flat|global}}_load_dwordx3 ; EG: VTX_READ_128 define amdgpu_kernel void @global_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll index 03e872192d1..fbbb8f80ccb 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -146,12 +146,11 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s ; FIXME: Should be packed into 2 registers per argument? ; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_postcvt: ; GCN: s_waitcnt -; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX9-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX9-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX9-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-DAG: v_mad_mixlo_f16 v{{[0-9]+}}, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GFX9-DAG: v_mad_mixhi_f16 v{{[0-9]+}}, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX9-DAG: v_mad_mixlo_f16 v{{[0-9]+}}, v1, v3, v5 op_sel_hi:[1,1,1] +; GFX9-DAG: v_pk_max_f16 v1, v1, v1 clamp +; GFX9: v_mov_b32_e32 v0, v{{[0-9]+}} ; GFX9-NEXT: s_setpc_b64 define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 { %src0.ext = fpext <3 x half> %src0 to <3 x float> diff --git a/llvm/test/CodeGen/AMDGPU/merge-stores.ll b/llvm/test/CodeGen/AMDGPU/merge-stores.ll index 7fb0e354df2..a5d111b4977 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-stores.ll +++ b/llvm/test/CodeGen/AMDGPU/merge-stores.ll @@ -275,8 +275,7 @@ define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace } ; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32: -; SI-DAG: buffer_load_dwordx2 -; SI-DAG: buffer_load_dword v +; SI-DAG: buffer_load_dwordx4 ; CI-DAG: buffer_load_dwordx3 ; GCN: s_waitcnt ; SI-DAG: buffer_store_dwordx2 @@ -566,8 +565,8 @@ define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* ; GCN-LABEL: {{^}}merge_global_store_7_constants_i32: ; GCN: buffer_store_dwordx4 -; SI: buffer_store_dwordx2 -; SI: buffer_store_dword v +; SI-DAG: buffer_store_dwordx2 +; SI-DAG: buffer_store_dword v ; CI: buffer_store_dwordx3 define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { store i32 34, i32 addrspace(1)* %out, align 4 @@ -615,8 +614,7 @@ define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* ; GCN-LABEL: {{^}}copy_v3i32_align4: ; GCN-NOT: SCRATCH_RSRC_DWORD -; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-NOT: offen ; GCN: s_waitcnt vmcnt @@ -650,8 +648,7 @@ define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %ou ; GCN-LABEL: {{^}}copy_v3f32_align4: ; GCN-NOT: SCRATCH_RSRC_DWORD -; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; SI-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-NOT: offen ; GCN: s_waitcnt vmcnt diff --git a/llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll new file mode 100644 index 00000000000..3c55dc8ef91 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll @@ -0,0 +1,216 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s + +; CHECK-LABEL: spill_v2i32: +; CHECK-DAG: buffer_store_dword v{{.*}} offset:24 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:28 ; 4-byte Folded Spill +; CHECK: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-DAG: buffer_load_dword v{{.*}} offset:24 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:28 ; 4-byte Folded Reload + +define void @spill_v2i32() { +entry: + %alloca = alloca <2 x i32>, i32 2, align 4, addrspace(5) + + %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %alloca, i32 1 + %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr + + ; Force %a to spill. + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + %outptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %alloca, i32 1 + store volatile <2 x i32> %a, <2 x i32> addrspace(5)* %outptr + + ret void +} + +; CHECK-LABEL: spill_v2f32: +; CHECK-DAG: buffer_store_dword v{{.*}} offset:24 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:28 ; 4-byte Folded Spill +; CHECK: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-DAG: buffer_load_dword v{{.*}} offset:24 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:28 ; 4-byte Folded Reload + +define void @spill_v2f32() { +entry: + %alloca = alloca <2 x i32>, i32 2, align 4, addrspace(5) + + %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %alloca, i32 1 + %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr + + ; Force %a to spill. + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + %outptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %alloca, i32 1 + store volatile <2 x i32> %a, <2 x i32> addrspace(5)* %outptr + + ret void +} + +; CHECK-LABEL: spill_v3i32: +; CHECK-DAG: buffer_store_dword v{{.*}} offset:48 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:52 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:56 ; 4-byte Folded Spill +; CHECK: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-DAG: buffer_load_dword v{{.*}} offset:48 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:52 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:56 ; 4-byte Folded Reload + +define void @spill_v3i32() { +entry: + %alloca = alloca <3 x i32>, i32 2, align 4, addrspace(5) + + %aptr = getelementptr <3 x i32>, <3 x i32> addrspace(5)* %alloca, i32 1 + %a = load volatile <3 x i32>, <3 x i32> addrspace(5)* %aptr + + ; Force %a to spill. + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + %outptr = getelementptr <3 x i32>, <3 x i32> addrspace(5)* %alloca, i32 1 + store volatile <3 x i32> %a, <3 x i32> addrspace(5)* %outptr + + ret void +} + +; CHECK-LABEL: spill_v3f32: +; CHECK-DAG: buffer_store_dword v{{.*}} offset:48 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:52 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:56 ; 4-byte Folded Spill +; CHECK: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-DAG: buffer_load_dword v{{.*}} offset:48 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:52 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:56 ; 4-byte Folded Reload + +define void @spill_v3f32() { +entry: + %alloca = alloca <3 x i32>, i32 2, align 4, addrspace(5) + + %aptr = getelementptr <3 x i32>, <3 x i32> addrspace(5)* %alloca, i32 1 + %a = load volatile <3 x i32>, <3 x i32> addrspace(5)* %aptr + + ; Force %a to spill. + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + %outptr = getelementptr <3 x i32>, <3 x i32> addrspace(5)* %alloca, i32 1 + store volatile <3 x i32> %a, <3 x i32> addrspace(5)* %outptr + + ret void +} + +; CHECK-LABEL: spill_v4i32: +; CHECK-DAG: buffer_store_dword v{{.*}} offset:48 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:52 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:56 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:60 ; 4-byte Folded Spill +; CHECK: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-DAG: buffer_load_dword v{{.*}} offset:48 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:52 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:56 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:60 ; 4-byte Folded Reload + +define void @spill_v4i32() { +entry: + %alloca = alloca <4 x i32>, i32 2, align 4, addrspace(5) + + %aptr = getelementptr <4 x i32>, <4 x i32> addrspace(5)* %alloca, i32 1 + %a = load volatile <4 x i32>, <4 x i32> addrspace(5)* %aptr + + ; Force %a to spill. + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + %outptr = getelementptr <4 x i32>, <4 x i32> addrspace(5)* %alloca, i32 1 + store volatile <4 x i32> %a, <4 x i32> addrspace(5)* %outptr + + ret void +} + +; CHECK-LABEL: spill_v4f32: +; CHECK-DAG: buffer_store_dword v{{.*}} offset:48 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:52 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:56 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:60 ; 4-byte Folded Spill +; CHECK: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-DAG: buffer_load_dword v{{.*}} offset:48 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:52 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:56 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:60 ; 4-byte Folded Reload + +define void @spill_v4f32() { +entry: + %alloca = alloca <4 x i32>, i32 2, align 4, addrspace(5) + + %aptr = getelementptr <4 x i32>, <4 x i32> addrspace(5)* %alloca, i32 1 + %a = load volatile <4 x i32>, <4 x i32> addrspace(5)* %aptr + + ; Force %a to spill. + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + %outptr = getelementptr <4 x i32>, <4 x i32> addrspace(5)* %alloca, i32 1 + store volatile <4 x i32> %a, <4 x i32> addrspace(5)* %outptr + + ret void +} + +; CHECK-LABEL: spill_v5i32: +; CHECK-DAG: buffer_store_dword v{{.*}} offset:96 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:100 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:104 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:108 ; 4-byte Folded Spill +; CHECK: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-DAG: buffer_load_dword v{{.*}} offset:96 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:100 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:104 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:108 ; 4-byte Folded Reload + +define void @spill_v5i32() { +entry: + %alloca = alloca <5 x i32>, i32 2, align 4, addrspace(5) + + %aptr = getelementptr <5 x i32>, <5 x i32> addrspace(5)* %alloca, i32 1 + %a = load volatile <5 x i32>, <5 x i32> addrspace(5)* %aptr + + ; Force %a to spill. + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + %outptr = getelementptr <5 x i32>, <5 x i32> addrspace(5)* %alloca, i32 1 + store volatile <5 x i32> %a, <5 x i32> addrspace(5)* %outptr + + ret void +} + +; CHECK-LABEL: spill_v5f32: +; CHECK-DAG: buffer_store_dword v{{.*}} offset:96 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:100 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:104 ; 4-byte Folded Spill +; CHECK-DAG: buffer_store_dword v{{.*}} offset:108 ; 4-byte Folded Spill +; CHECK: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-DAG: buffer_load_dword v{{.*}} offset:96 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:100 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:104 ; 4-byte Folded Reload +; CHECK-DAG: buffer_load_dword v{{.*}} offset:108 ; 4-byte Folded Reload + +define void @spill_v5f32() { +entry: + %alloca = alloca <5 x i32>, i32 2, align 4, addrspace(5) + + %aptr = getelementptr <5 x i32>, <5 x i32> addrspace(5)* %alloca, i32 1 + %a = load volatile <5 x i32>, <5 x i32> addrspace(5)* %aptr + + ; Force %a to spill. + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + %outptr = getelementptr <5 x i32>, <5 x i32> addrspace(5)* %alloca, i32 1 + store volatile <5 x i32> %a, <5 x i32> addrspace(5)* %outptr + + ret void +} + + + diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll index 6ead8cc409e..26923f8c3eb 100644 --- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll +++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll @@ -112,6 +112,7 @@ define amdgpu_kernel void @v_select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16 ; GFX89: v_cndmask_b32_e32 ; GFX89: cndmask +; VI: cndmask ; GFX89-NOT: cndmask define amdgpu_kernel void @v_select_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr, i32 %c) #0 { %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.ptr @@ -230,6 +231,21 @@ define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x f ret void } +; GCN-LABEL: {{^}}s_select_v3f32: +; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} + +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 + +; GCN: buffer_store_dwordx +define amdgpu_kernel void @s_select_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, <3 x float> %b, i32 %c) #0 { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <3 x float> %a, <3 x float> %b + store <3 x float> %select, <3 x float> addrspace(1)* %out, align 16 + ret void +} + ; GCN-LABEL: {{^}}s_select_v4f32: ; GCN: s_load_dwordx4 ; GCN: s_load_dwordx4 diff --git a/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll b/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll index cb4601ac165..c95beaea5a7 100644 --- a/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll +++ b/llvm/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll @@ -1,4 +1,5 @@ -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx700 -verify-machineinstrs <%s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SICI %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx600 -verify-machineinstrs <%s | FileCheck -enable-var-scope -check-prefixes=GCN,SICI,SI %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx700 -verify-machineinstrs <%s | FileCheck -enable-var-scope -check-prefixes=GCN,SICI,CI %s ; Check that an addrspace(1) (const) load with various combinations of ; uniform, nonuniform and constant address components all load with an @@ -8,7 +9,8 @@ ; GCN-LABEL: {{^}}nonuniform_uniform: ; GCN-NOT: readfirstlane -; SICI: buffer_load_dwordx4 {{.*}} addr64 +; SI: buffer_load_dwordx4 {{.*}} addr64 +; CI: buffer_load_dwordx3 {{.*}} addr64 define amdgpu_ps float @nonuniform_uniform(i32 %arg18) { .entry: @@ -21,7 +23,8 @@ define amdgpu_ps float @nonuniform_uniform(i32 %arg18) { ; GCN-LABEL: {{^}}uniform_nonuniform: ; GCN-NOT: readfirstlane -; SICI: buffer_load_dwordx4 {{.*}} addr64 +; SI: buffer_load_dwordx4 {{.*}} addr64 +; CI: buffer_load_dwordx3 {{.*}} addr64 define amdgpu_ps float @uniform_nonuniform(i32 inreg %offset, i32 %arg18) { .entry: @@ -35,7 +38,8 @@ define amdgpu_ps float @uniform_nonuniform(i32 inreg %offset, i32 %arg18) { ; GCN-LABEL: {{^}}const_nonuniform: ; GCN-NOT: readfirstlane -; SICI: buffer_load_dwordx4 {{.*}} addr64 +; SI: buffer_load_dwordx4 {{.*}} addr64 +; CI: buffer_load_dwordx3 {{.*}} addr64 define amdgpu_ps float @const_nonuniform(i32 %arg18) { .entry: @@ -49,7 +53,8 @@ define amdgpu_ps float @const_nonuniform(i32 %arg18) { ; GCN-LABEL: {{^}}nonuniform_nonuniform: ; GCN-NOT: readfirstlane -; SICI: buffer_load_dwordx4 {{.*}} addr64 +; SI: buffer_load_dwordx4 {{.*}} addr64 +; CI: buffer_load_dwordx3 {{.*}} addr64 define amdgpu_ps float @nonuniform_nonuniform(i32 %offset, i32 %arg18) { .entry: diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll index 50bc5889669..d52a7ffcbe6 100644 --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -37,13 +37,12 @@ define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { ; SI-LABEL: test_s_sext_i32_to_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dword s2, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s4, s4, s5 -; SI-NEXT: s_add_i32 s4, s4, s2 +; SI-NEXT: s_mul_i32 s2, s4, s5 +; SI-NEXT: s_add_i32 s4, s2, s6 ; SI-NEXT: s_ashr_i32 s5, s4, 31 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -54,16 +53,15 @@ define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a ; VI-LABEL: test_s_sext_i32_to_i64: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s1, s2, s3 -; VI-NEXT: s_add_i32 s1, s1, s0 -; VI-NEXT: s_ashr_i32 s0, s1, 31 -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: s_mul_i32 s0, s0, s1 +; VI-NEXT: s_add_i32 s0, s0, s2 +; VI-NEXT: s_ashr_i32 s1, s0, 31 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm entry: @@ -292,14 +290,13 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 ; SI-LABEL: v_sext_i1_to_i16_with_and: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dword s0, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v0 ; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -308,14 +305,13 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 ; VI-LABEL: v_sext_i1_to_i16_with_and: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v0 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 +; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v1 ; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll index b8824be4725..5a55ce51440 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-wide-sgpr.ll @@ -43,6 +43,54 @@ ret: ret void } +; ALL-LABEL: {{^}}spill_sgpr_x3: +; SMEM: s_add_u32 m0, s3, 0x100{{$}} +; SMEM: s_buffer_store_dword s +; SMEM: s_buffer_store_dword s +; SMEM: s_buffer_store_dword s +; SMEM: s_cbranch_scc1 + +; SMEM: s_add_u32 m0, s3, 0x100{{$}} +; SMEM: s_buffer_load_dword s +; SMEM: s_buffer_load_dword s +; SMEM: s_buffer_load_dword s +; SMEM: s_dcache_wb +; SMEM: s_endpgm + +; FIXME: Should only need 4 bytes +; SMEM: ScratchSize: 16 + +; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0 +; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1 +; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2 +; VGPR: s_cbranch_scc1 + +; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0 +; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1 +; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 2 + + +; VMEM: buffer_store_dword +; VMEM: buffer_store_dword +; VMEM: buffer_store_dword +; VMEM: s_cbranch_scc1 + +; VMEM: buffer_load_dword +; VMEM: buffer_load_dword +; VMEM: buffer_load_dword +define amdgpu_kernel void @spill_sgpr_x3(i32 addrspace(1)* %out, i32 %in) #0 { + %wide.sgpr = call <3 x i32> asm sideeffect "; def $0", "=s" () #0 + %cmp = icmp eq i32 %in, 0 + br i1 %cmp, label %bb0, label %ret + +bb0: + call void asm sideeffect "; use $0", "s"(<3 x i32> %wide.sgpr) #0 + br label %ret + +ret: + ret void +} + ; ALL-LABEL: {{^}}spill_sgpr_x4: ; SMEM: s_add_u32 m0, s3, 0x100{{$}} ; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS:[0-9]+:[0-9]+]]{{\]}}, m0 ; 16-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/store-global.ll b/llvm/test/CodeGen/AMDGPU/store-global.ll index c0b9f6cdc33..a8843365421 100644 --- a/llvm/test/CodeGen/AMDGPU/store-global.ll +++ b/llvm/test/CodeGen/AMDGPU/store-global.ll @@ -273,13 +273,12 @@ entry: } ; FUNC-LABEL: {{^}}store_v3i32: -; SI-DAG: buffer_store_dwordx2 ; SI-DAG: buffer_store_dword v +; SI-DAG: buffer_store_dwordx2 -; VI-DAG: buffer_store_dwordx3 +; VI: buffer_store_dwordx3 -; GFX9-DAG: global_store_dwordx2 -; GFX9-DAG: global_store_dword v +; GFX9: global_store_dwordx3 ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.XY}}, {{T[0-9]+\.[XYZW]}}, diff --git a/llvm/test/CodeGen/AMDGPU/v_mac.ll b/llvm/test/CodeGen/AMDGPU/v_mac.ll index da57155f33e..c96d0324260 100644 --- a/llvm/test/CodeGen/AMDGPU/v_mac.ll +++ b/llvm/test/CodeGen/AMDGPU/v_mac.ll @@ -135,7 +135,7 @@ entry: ; GCN-LABEL: {{^}}safe_mad_sub0_src0: ; GCN: v_sub_f32_e32 [[SUB0:v[0-9]+]], 0, -; GCN: v_mac_f32_e32 v{{[0-9]+}}, [[SUB0]], v{{[0-9]+}} +; GCN: v_ma{{[cd]}}_f32{{[_e32]*}} v{{[0-9]+}}, [[SUB0]], v{{[0-9]+}} define amdgpu_kernel void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll index bcebc9f2d8b..0c52daca047 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -22,25 +22,25 @@ ; MESA-NOT: s_mov_b32 s3 ; HSA-NOT: s_mov_b32 s7 -; GCNMESA-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCNMESA-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCNMESA-DAG: s_mov_b32 s14, -1 -; SIMESA-DAG: s_mov_b32 s15, 0xe8f000 -; VIMESA-DAG: s_mov_b32 s15, 0xe80000 -; GFX9MESA-DAG: s_mov_b32 s15, 0xe00000 +; GCNMESA-DAG: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 +; GCNMESA-DAG: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 +; GCNMESA-DAG: s_mov_b32 s18, -1 +; SIMESA-DAG: s_mov_b32 s19, 0xe8f000 +; VIMESA-DAG: s_mov_b32 s19, 0xe80000 +; GFX9MESA-DAG: s_mov_b32 s19, 0xe00000 -; GCNMESAMESA: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} ; 4-byte Folded Spill +; GCNMESAMESA: buffer_store_dword {{v[0-9]+}}, off, s[16:19], s3 offset:{{[0-9]+}} ; 4-byte Folded Spill -; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[12:15], s3 offset:{{[0-9]+}} -; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[12:15], s3 offset:{{[0-9]+}} -; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[12:15], s3 offset:{{[0-9]+}} -; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[12:15], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[16:19], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[16:19], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[16:19], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_store_dword {{v[0-9]}}, off, s[16:19], s3 offset:{{[0-9]+}} -; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} -; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} -; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} -; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[16:19], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[16:19], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[16:19], s3 offset:{{[0-9]+}} +; GCNMESA: buffer_load_dword {{v[0-9]+}}, off, s[16:19], s3 offset:{{[0-9]+}} |