diff options
Diffstat (limited to 'llvm/test')
21 files changed, 376 insertions, 285 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll index cd1ce13eb16..c21abaeaaf5 100644 --- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -4,15 +4,14 @@ %struct.ByValStruct = type { [4 x i32] } ; GCN-LABEL: {{^}}void_func_byval_struct: -; GCN: s_mov_b32 s5, s32 -; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32 offset:4{{$}} ; GCN-NOT: s32 -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s5 offset:4{{$}} +; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:4{{$}} ; GCN-NOT: s32 -; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:20{{$}} +; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:20{{$}} ; GCN-NOT: s32 -; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s5 offset:20{{$}} +; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:20{{$}} ; GCN-NOT: s32 define hidden void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 { entry: @@ -183,15 +182,14 @@ entry: } ; GCN-LABEL: {{^}}void_func_byval_struct_align8: -; GCN: s_mov_b32 s5, s32 -; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} ; GCN-NOT: s32 -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s5 offset:8{{$}} +; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8{{$}} ; GCN-NOT: s32 -; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:24{{$}} +; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:24{{$}} ; GCN-NOT: s32 -; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s5 offset:24{{$}} +; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:24{{$}} ; GCN-NOT: s32 define hidden void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg1) #1 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index 4d05fcc7de2..79a238a287d 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -768,16 +768,17 @@ entry: } ; GCN-LABEL: {{^}}tail_call_byval_align16: -; GCN: s_mov_b32 s5, s32 -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:28 ; 4-byte Folded Spill -; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:24 ; 4-byte Folded Spill -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:32 -; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:36 -; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:20 -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:16 +; GCN-NOT: s32 +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:36 +; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:20 +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:16 ; GCN: s_getpc_b64 -; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:24 ; 4-byte Folded Reload -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:28 ; 4-byte Folded Reload +; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NOT: s32 ; GCN: s_setpc_b64 define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { entry: @@ -787,16 +788,17 @@ entry: } ; GCN-LABEL: {{^}}tail_call_stack_passed_arg_alignment_v32i32_f64: -; GCN: s_mov_b32 s5, s32 -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Spill -; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 -; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:8 -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 -; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:8 +; GCN-NOT: s32 +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; GCN: s_getpc_b64 -; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Reload -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Reload +; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NOT: s32 ; GCN: s_setpc_b64 define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll index bc9160772e2..70c69d9f6c0 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -22,9 +22,8 @@ define void @callee_no_stack_no_fp_elim() #1 { ; GCN-LABEL: {{^}}callee_with_stack: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_mov_b32 s5, s32 ; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}} -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}} +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack() #0 { @@ -100,7 +99,7 @@ declare void @external_void_func_void() #0 ; Make sure if a CSR vgpr is used for SGPR spilling, it is saved and restored ; GCN-LABEL: {{^}}callee_func_sgpr_spill_no_calls: ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN: v_writelane_b32 v32 @@ -108,7 +107,7 @@ declare void @external_void_func_void() #0 ; GCN: v_readlane_b32 s{{[0-9]+}}, v32 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll index 5060c0fed1a..fa5d20a1e96 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -115,8 +115,8 @@ define void @use_workgroup_id_x() #1 { ; GCN-LABEL: {{^}}use_stack_workgroup_id_x: ; GCN: s_waitcnt -; GCN: s_mov_b32 s5, s32 -; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:4 +; GCN-NOT: s32 +; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GCN: ; use s6 ; GCN: s_setpc_b64 define void @use_stack_workgroup_id_x() #1 { @@ -429,7 +429,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_z() #1 { } ; GCN-LABEL: {{^}}use_every_sgpr_input: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7 ; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll index c63d96917d9..aa33dfa3675 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -230,12 +230,11 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 { } ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x: -; GCN: s_mov_b32 s5, s32 -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4{{$}} +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Reload +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @too_many_args_use_workitem_id_x( @@ -357,12 +356,12 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x( ; frame[3] = VGPR spill slot ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval: -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GCN-NEXT: s_waitcnt ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32 -; GCN: buffer_load_dword v0, off, s[0:3], s5 offset:4 -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:12 ; 4-byte Folded Reload +; GCN: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN: s_setpc_b64 define void @too_many_args_use_workitem_id_x_byval( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, @@ -476,16 +475,15 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 { } ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz: -; GCN: s_mov_b32 s5, s32 -; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Spill -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4{{$}} +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8{{$}} +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:12{{$}} +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 -; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Reload +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @too_many_args_use_workitem_id_xyz( @@ -574,11 +572,10 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 { ; frame[2] = workitem Z ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_stack_yz: -; GCN: s_mov_b32 s5, s32 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31 -; GCN: buffer_load_dword v31, off, s[0:3], s5 offset:4{{$}} +; GCN: buffer_load_dword v31, off, s[0:3], s32 offset:4{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31 -; GCN: buffer_load_dword v31, off, s[0:3], s5 offset:8{{$}} +; GCN: buffer_load_dword v31, off, s[0:3], s32 offset:8{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31 ; GCN: s_waitcnt diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll index 92a255ceae6..819b0e5bfc5 100644 --- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -7,7 +7,7 @@ ; Materialize into a mov. Make sure there isn't an unnecessary copy. ; GCN-LABEL: {{^}}func_mov_fi_i32: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN: s_sub_u32 s6, s5, s4 +; GCN: s_sub_u32 s6, s32, s4 ; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 ; CI-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]] @@ -28,7 +28,7 @@ define void @func_mov_fi_i32() #0 { ; GCN-LABEL: {{^}}func_add_constant_to_fi_i32: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN: s_sub_u32 s6, s5, s4 +; GCN: s_sub_u32 s6, s32, s4 ; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 ; CI-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]] @@ -52,7 +52,7 @@ define void @func_add_constant_to_fi_i32() #0 { ; into. ; GCN-LABEL: {{^}}func_other_fi_user_i32: -; GCN: s_sub_u32 s6, s5, s4 +; GCN: s_sub_u32 s6, s32, s4 ; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 ; CI-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]] @@ -89,8 +89,7 @@ define void @func_load_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 { ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr: ; GCN: s_waitcnt -; GCN-NEXT: s_mov_b32 s5, s32 -; GCN-NEXT: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s5, s4 +; GCN-NEXT: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s4 ; CI-NEXT: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6 ; CI-NEXT: v_add_i32_e64 [[ADD:v[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 4, [[SHIFT]] @@ -112,9 +111,8 @@ define void @void_func_byval_struct_i8_i32_ptr({ i8, i32 } addrspace(5)* byval % ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_value: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s5, s32 -; GCN-NEXT: buffer_load_ubyte v0, off, s[0:3], s5 -; GCN_NEXT: buffer_load_dword v1, off, s[0:3], s5 offset:4 +; GCN-NEXT: buffer_load_ubyte v0, off, s[0:3], s32 +; GCN_NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 } addrspace(5)* byval %arg0) #0 { %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 0 %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 1 @@ -129,7 +127,7 @@ define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 } addrspace(5)* b ; FrameIndex is hidden behind a CopyFromReg in the second block. ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block: -; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s5, s4 +; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s4 ; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6 ; CI: v_add_i32_e64 [[ADD:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 4, [[SHIFT]] @@ -163,7 +161,7 @@ ret: ; Added offset can't be used with VOP3 add ; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32: -; GCN: s_sub_u32 s6, s5, s4 +; GCN: s_sub_u32 s6, s32, s4 ; GCN-DAG: s_movk_i32 s6, 0x204 ; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6 @@ -187,7 +185,7 @@ define void @func_other_fi_user_non_inline_imm_offset_i32() #0 { } ; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32_vcc_live: -; GCN: s_sub_u32 [[DIFF:s[0-9]+]], s5, s4 +; GCN: s_sub_u32 [[DIFF:s[0-9]+]], s32, s4 ; GCN-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x204 ; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[DIFF]], 6 @@ -243,7 +241,7 @@ bb5: ; GCN-LABEL: {{^}}alloca_ptr_nonentry_block: ; GCN: s_and_saveexec_b64 -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s5 offset:12 +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:12 define void @alloca_ptr_nonentry_block(i32 %arg0) #0 { %alloca0 = alloca { i8, i32 }, align 4, addrspace(5) %cmp = icmp eq i32 %arg0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index 442cdc92c30..9186b91f763 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -220,7 +220,7 @@ define void @void_func_v32i32(<32 x i32> %arg0) #0 { ; GCN-DAG: buffer_store_dwordx4 v[4:7], off ; GCN-DAG: buffer_store_dwordx4 v[8:11], off ; GCN-DAG: buffer_store_dwordx4 v[12:15], off -; GCN-DAG: buffer_load_dword [[STACKLOAD:v[0-9]+]], off, s[0:3], s5 +; GCN-DAG: buffer_load_dword [[STACKLOAD:v[0-9]+]], off, s[0:3], s32 ; GCN-DAG: buffer_store_dwordx4 v[16:19], off ; GCN-DAG: buffer_store_dwordx4 v[20:23], off ; GCN-DAG: buffer_store_dwordx4 v[24:27], off @@ -516,8 +516,8 @@ define void @void_func_struct_i8_i32({ i8, i32 } %arg0) #0 { } ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32: -; GCN-DAG: buffer_load_ubyte v[[ELT0:[0-9]+]], off, s[0:3], s5 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[ELT1:[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN-DAG: buffer_load_ubyte v[[ELT0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[ELT1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} ; GCN-DAG: buffer_store_dword v[[ELT1]] ; GCN-DAG: buffer_store_byte v[[ELT0]] define void @void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval %arg0) #0 { @@ -527,10 +527,10 @@ define void @void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval %arg0 } ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_x2: -; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s5 offset:4{{$}} -; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s5 offset:8{{$}} -; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s5 offset:12{{$}} -; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s32 offset:16{{$}} ; GCN: ds_write_b32 v0, v0 ; GCN: s_setpc_b64 @@ -544,9 +544,9 @@ define void @void_func_byval_struct_i8_i32_x2({ i8, i32 } addrspace(5)* byval %a } ; GCN-LABEL: {{^}}void_func_byval_i32_byval_i64: -; GCN-DAG: buffer_load_dword v[[ARG0_LOAD:[0-9]+]], off, s[0:3], s5 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[ARG1_LOAD0:[0-9]+]], off, s[0:3], s5 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[ARG1_LOAD1:[0-9]+]], off, s[0:3], s5 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[ARG0_LOAD:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[ARG1_LOAD0:[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[ARG1_LOAD1:[0-9]+]], off, s[0:3], s32 offset:12{{$}} ; GCN-DAG: buffer_store_dword v[[ARG0_LOAD]], off ; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ARG1_LOAD0]]:[[ARG1_LOAD1]]{{\]}}, off define void @void_func_byval_i32_byval_i64(i32 addrspace(5)* byval %arg0, i64 addrspace(5)* byval %arg1) #0 { @@ -566,9 +566,9 @@ define void @void_func_byval_i32_byval_i64(i32 addrspace(5)* byval %arg0, i64 ad ; GCN-DAG: buffer_store_dwordx4 v[20:23], off ; GCN-DAG: buffer_store_dwordx4 v[24:27], off ; GCN-DAG: buffer_store_dwordx4 v[28:31], off -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1:[0-9]+]], off, s[0:3], s5 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:8 -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:12 +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:8 +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:12 ; GCN: buffer_store_dword v[[LOAD_ARG1]] ; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off @@ -581,14 +581,14 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 ; FIXME: Different ext load types on CI vs. VI ; GCN-LABEL: {{^}}void_func_v32i32_i1_i8_i16: -; GCN-DAG: buffer_load_ubyte [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s5 offset:4{{$}} -; VI-DAG: buffer_load_ushort [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:8{{$}} -; VI-DAG: buffer_load_ushort [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s5 offset:12{{$}} -; VI-DAG: buffer_load_ushort [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN-DAG: buffer_load_ubyte [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; VI-DAG: buffer_load_ushort [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; VI-DAG: buffer_load_ushort [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; VI-DAG: buffer_load_ushort [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:16{{$}} -; CI-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:8{{$}} -; CI-DAG: buffer_load_dword [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s5 offset:12{{$}} -; CI-DAG: buffer_load_dword [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; CI-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; CI-DAG: buffer_load_dword [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; CI-DAG: buffer_load_dword [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:16{{$}} ; GCN-DAG: v_and_b32_e32 [[TRUNC_ARG1_I1:v[0-9]+]], 1, [[LOAD_ARG1]] ; CI-DAG: v_cvt_f16_f32_e32 [[CVT_ARG4:v[0-9]+]], [[LOAD_ARG4]] @@ -609,10 +609,10 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1 } ; GCN-LABEL: {{^}}void_func_v32i32_v2i32_v2f32: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:16{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_1]]{{\]}}, off ; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off @@ -624,8 +624,8 @@ define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2 } ; GCN-LABEL: {{^}}void_func_v32i32_v2i16_v2f16: -; GFX9-DAG: buffer_load_dword [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s5 offset:4{{$}} -; GFX9-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GFX9-DAG: buffer_load_dword [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GFX9-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} ; GFX9: buffer_store_dword [[LOAD_ARG1]], off ; GFX9: buffer_store_short [[LOAD_ARG2]], off define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2) #0 { @@ -636,15 +636,15 @@ define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2 } ; GCN-LABEL: {{^}}void_func_v32i32_v2i64_v2f64: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:20{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:24{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:28{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:32{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:28{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:32{{$}} ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off @@ -656,15 +656,15 @@ define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 } ; GCN-LABEL: {{^}}void_func_v32i32_v4i32_v4f32: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:20{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:24{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:28{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:32{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:28{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:32{{$}} ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off @@ -676,23 +676,23 @@ define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 } ; GCN-LABEL: {{^}}void_func_v32i32_v8i32_v8f32: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:16{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s5 offset:20{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s5 offset:24{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s5 offset:28{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s5 offset:32{{$}} - -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:36{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:40{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:44{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:48{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s5 offset:52{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s5 offset:56{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s5 offset:60{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s5 offset:64{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s32 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s32 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s32 offset:28{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s32 offset:32{{$}} + +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:36{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:40{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:44{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:48{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s32 offset:52{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s32 offset:56{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:60{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:64{{$}} ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_4]]:[[LOAD_ARG1_7]]{{\]}}, off ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off @@ -706,39 +706,39 @@ define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 } ; GCN-LABEL: {{^}}void_func_v32i32_v16i32_v16f32: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_:[0-9]+]], off, s[0:3], s5 offset:16{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s5 offset:20{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s5 offset:24{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s5 offset:28{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s5 offset:32{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_8:[0-9]+]], off, s[0:3], s5 offset:36{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_9:[0-9]+]], off, s[0:3], s5 offset:40{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_10:[0-9]+]], off, s[0:3], s5 offset:44{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_11:[0-9]+]], off, s[0:3], s5 offset:48{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_12:[0-9]+]], off, s[0:3], s5 offset:52{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_13:[0-9]+]], off, s[0:3], s5 offset:56{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_14:[0-9]+]], off, s[0:3], s5 offset:60{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s5 offset:64{{$}} - -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:68{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:72{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:76{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:80{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s5 offset:84{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s5 offset:88{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s5 offset:92{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s5 offset:96{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_8:[0-9]+]], off, s[0:3], s5 offset:100{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_9:[0-9]+]], off, s[0:3], s5 offset:104{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_10:[0-9]+]], off, s[0:3], s5 offset:108{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_11:[0-9]+]], off, s[0:3], s5 offset:112{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_12:[0-9]+]], off, s[0:3], s5 offset:116{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_13:[0-9]+]], off, s[0:3], s5 offset:120{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_14:[0-9]+]], off, s[0:3], s5 offset:124{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s5 offset:128{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_:[0-9]+]], off, s[0:3], s32 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s32 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s32 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s32 offset:28{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s32 offset:32{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_8:[0-9]+]], off, s[0:3], s32 offset:36{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_9:[0-9]+]], off, s[0:3], s32 offset:40{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_10:[0-9]+]], off, s[0:3], s32 offset:44{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_11:[0-9]+]], off, s[0:3], s32 offset:48{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_12:[0-9]+]], off, s[0:3], s32 offset:52{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_13:[0-9]+]], off, s[0:3], s32 offset:56{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_14:[0-9]+]], off, s[0:3], s32 offset:60{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s32 offset:64{{$}} + +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:68{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:72{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:76{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:80{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s32 offset:84{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s32 offset:88{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:92{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:96{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_8:[0-9]+]], off, s[0:3], s32 offset:100{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_9:[0-9]+]], off, s[0:3], s32 offset:104{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_10:[0-9]+]], off, s[0:3], s32 offset:108{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_11:[0-9]+]], off, s[0:3], s32 offset:112{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_12:[0-9]+]], off, s[0:3], s32 offset:116{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_13:[0-9]+]], off, s[0:3], s32 offset:120{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_14:[0-9]+]], off, s[0:3], s32 offset:124{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s32 offset:128{{$}} define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, <16 x float> %arg2) #0 { store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef store volatile <16 x i32> %arg1, <16 x i32> addrspace(1)* undef diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll index 357ce3d9a9a..07ae3d00d07 100644 --- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -494,13 +494,13 @@ entry: ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg: ; GCN: s_waitcnt -; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}} +; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} +; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}} define void @load_private_hi_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 { entry: %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2045 @@ -513,13 +513,13 @@ entry: ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg: ; GCN: s_waitcnt -; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}} +; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} +; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}} define void @load_private_hi_v2f16_reglo_vreg(half addrspace(5)* byval %in, half %reg) #0 { entry: %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2045 @@ -568,13 +568,13 @@ entry: ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8: ; GCN: s_waitcnt -; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}} +; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} +; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}} define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 { entry: %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091 @@ -588,13 +588,13 @@ entry: ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_zexti8: ; GCN: s_waitcnt -; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}} +; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} +; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}} define void @load_private_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, half %reg) #0 { entry: %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091 @@ -609,13 +609,13 @@ entry: ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_sexti8: ; GCN: s_waitcnt -; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}} +; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} +; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}} define void @load_private_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, half %reg) #0 { entry: %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091 @@ -630,13 +630,13 @@ entry: ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8: ; GCN: s_waitcnt -; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}} +; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} +; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}} define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 { entry: %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091 @@ -789,7 +789,7 @@ entry: ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset: ; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4094 +; GFX900-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4094 define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -806,7 +806,7 @@ entry: ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset: ; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4095 +; GFX900-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095 define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -824,7 +824,7 @@ entry: ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset: ; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4095 +; GFX900-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095 define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -975,9 +975,9 @@ entry: ; FIXME: Is there a cost to using the extload over not? ; GCN-LABEL: {{^}}load_private_v2i16_split: ; GCN: s_waitcnt -; GFX900: buffer_load_ushort v0, off, s[0:3], s5 offset:4{{$}} +; GFX900: buffer_load_ushort v0, off, s[0:3], s32 offset:4{{$}} ; GFX900-NEXT: s_waitcnt -; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:6 +; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:6 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval %in) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll index 62bc8c26988..47f5ea35550 100644 --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -590,13 +590,13 @@ entry: ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg: ; GCN: s_waitcnt -; GFX900: buffer_load_short_d16 v0, off, s[0:3], s5 offset:4094{{$}} +; GFX900: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} +; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}} define void @load_private_lo_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -609,7 +609,7 @@ entry: ; GCN-LABEL: {{^}}load_private_lo_v2i16_reghi_vreg: ; GCN: s_waitcnt -; GFX900: buffer_load_ushort v1, off, s[0:3], s5 offset:4094{{$}} +; GFX900: buffer_load_ushort v1, off, s[0:3], s32 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900: v_and_b32 ; GFX900: v_lshl_or_b32 @@ -618,7 +618,7 @@ entry: ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} +; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}} define void @load_private_lo_v2i16_reghi_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 { entry: %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2045 @@ -631,13 +631,13 @@ entry: ; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg: ; GCN: s_waitcnt -; GFX900: buffer_load_short_d16 v0, off, s[0:3], s5 offset:4094{{$}} +; GFX900: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}} +; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}} define void @load_private_lo_v2f16_reglo_vreg(half addrspace(5)* byval %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -704,13 +704,13 @@ entry: ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_zexti8: ; GCN: s_waitcnt -; GFX900: buffer_load_ubyte_d16 v0, off, s[0:3], s5 offset:4095{{$}} +; GFX900: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} +; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}} define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -724,13 +724,13 @@ entry: ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_sexti8: ; GCN: s_waitcnt -; GFX900: buffer_load_sbyte_d16 v0, off, s[0:3], s5 offset:4095{{$}} +; GFX900: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}} +; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}} define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 { entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -895,7 +895,7 @@ entry: ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_to_offset: ; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s5 offset:4094 +; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 ; NO-D16-HI: buffer_load_ushort v define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 { @@ -914,7 +914,7 @@ entry: ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_sexti8_to_offset: ; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s5 offset:4095 +; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 ; NO-D16-HI: buffer_load_sbyte v define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 { @@ -934,7 +934,7 @@ entry: ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_zexti8_to_offset: ; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s5 offset:4095 +; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 ; NO-D16-HI: buffer_load_ubyte v define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 { @@ -954,7 +954,7 @@ entry: ; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_sexti8_to_offset: ; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s5 offset:4095 +; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 ; NO-D16-HI: buffer_load_sbyte v define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 { @@ -975,7 +975,7 @@ entry: ; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_zexti8_to_offset: ; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s5 offset:4095 +; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 ; NO-D16-HI: buffer_load_ubyte v define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll index 4822818e901..361b8035f61 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -126,7 +126,7 @@ entry: ; CHECK-O0-DAG: s_mov_b32 [[IDX_S:s[0-9]+]], s4 ; CHECK-O0-DAG: v_mov_b32_e32 [[IDX_V:v[0-9]+]], [[IDX_S]] ; CHECK-O0-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec -; CHECK-O0-DAG: buffer_store_dword [[IDX_V]], off, s[0:3], s5 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill +; CHECK-O0-DAG: buffer_store_dword [[IDX_V]], off, s[0:3], s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill ; CHECK-O0: [[LOOPBB0:BB[0-9]+_[0-9]+]]: ; CHECK-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], {{.*}} ; 4-byte Folded Reload @@ -149,22 +149,22 @@ entry: ; CHECK-O0: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} ; CHECK-O0: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]] ; CHECK-O0: s_and_saveexec_b64 [[CMP]], [[CMP]] -; CHECK-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s[0:3], s5 offset:[[IDX_OFF]] ; 4-byte Folded Reload +; CHECK-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s[0:3], s32 offset:[[IDX_OFF]] ; 4-byte Folded Reload ; CHECK-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, {{.*}} idxen ; CHECK-O0: s_waitcnt vmcnt(0) -; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s5 offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill +; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill ; CHECK-O0: s_xor_b64 exec, exec, [[CMP]] ; CHECK-O0-NEXT: s_cbranch_execnz [[LOOPBB0]] ; CHECK-O0: v_readlane_b32 s[[S1:[0-9]+]], v{{[0-9]+}}, 4 ; CHECK-O0: v_readlane_b32 s[[S2:[0-9]+]], v{{[0-9]+}}, 5 ; CHECK-O0: s_mov_b64 exec, s{{\[}}[[S1]]:[[S2]]{{\]}} -; CHECK-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s5 offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload -; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s5 offset:[[RES_OFF:[0-9]+]] ; 4-byte Folded Spill +; CHECK-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s32 offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload +; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF:[0-9]+]] ; 4-byte Folded Spill ; CHECK-O0: s_cbranch_execz [[TERMBB:BB[0-9]+_[0-9]+]] ; CHECK-O0: BB{{[0-9]+_[0-9]+}}: ; CHECK-O0-DAG: s_mov_b64 s{{\[}}[[SAVEEXEC0:[0-9]+]]:[[SAVEEXEC1:[0-9]+]]{{\]}}, exec -; CHECK-O0-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s5 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill +; CHECK-O0-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill ; CHECK-O0: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC0]], [[SAVEEXEC_IDX0:[0-9]+]] ; CHECK-O0: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC1]], [[SAVEEXEC_IDX1:[0-9]+]] @@ -189,21 +189,21 @@ entry: ; CHECK-O0: v_cmp_eq_u64_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v{{\[}}[[VRSRC2]]:[[VRSRC3]]{{\]}} ; CHECK-O0: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]] ; CHECK-O0: s_and_saveexec_b64 [[CMP]], [[CMP]] -; CHECK-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s[0:3], s5 offset:[[IDX_OFF]] ; 4-byte Folded Reload +; CHECK-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s[0:3], s32 offset:[[IDX_OFF]] ; 4-byte Folded Reload ; CHECK-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, {{.*}} idxen ; CHECK-O0: s_waitcnt vmcnt(0) -; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s5 offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill +; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill ; CHECK-O0: s_xor_b64 exec, exec, [[CMP]] ; CHECK-O0-NEXT: s_cbranch_execnz [[LOOPBB1]] ; CHECK-O0: v_readlane_b32 s[[SAVEEXEC0:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX0]] ; CHECK-O0: v_readlane_b32 s[[SAVEEXEC1:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX1]] ; CHECK-O0: s_mov_b64 exec, s{{\[}}[[SAVEEXEC0]]:[[SAVEEXEC1]]{{\]}} -; CHECK-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s5 offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload -; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s5 offset:[[RES_OFF]] ; 4-byte Folded Spill +; CHECK-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s32 offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload +; CHECK-O0: buffer_store_dword [[RES]], off, s[0:3], s32 offset:[[RES_OFF]] ; 4-byte Folded Spill ; CHECK-O0: [[TERMBB]]: -; CHECK-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s5 offset:[[RES_OFF]] ; 4-byte Folded Reload +; CHECK-O0: buffer_load_dword [[RES:v[0-9]+]], off, s[0:3], s32 offset:[[RES_OFF]] ; 4-byte Folded Reload ; CHECK-O0: global_store_dword v[{{[0-9]+:[0-9]+}}], [[RES]], off define void @mubuf_vgpr_outside_entry(<4 x i32> %i, <4 x i32> %j, i32 %c, float addrspace(1)* %in, float addrspace(1)* %out) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir b/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir index 3aae677844d..06075ac8e33 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir @@ -19,26 +19,26 @@ machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 scratchWaveOffsetReg: $sgpr5 frameOffsetReg: $sgpr5 + stackPtrOffsetReg: $sgpr32 body: | ; CHECK-LABEL: name: scavenge_register_position ; CHECK: bb.0: ; CHECK: successors: %bb.1(0x80000000) ; CHECK: liveins: $sgpr4, $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: $sgpr5 = COPY $sgpr4 - ; CHECK: $sgpr6 = S_ADD_U32 $sgpr5, 524288, implicit-def $scc + ; CHECK: $sgpr6 = S_ADD_U32 $sgpr32, 524288, implicit-def $scc ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5) ; CHECK: S_BRANCH %bb.1 ; CHECK: bb.1: - ; CHECK: liveins: $sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: $sgpr4 = S_ADD_U32 $sgpr5, 524288, implicit-def $scc + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: $sgpr4 = S_ADD_U32 $sgpr32, 524288, implicit-def $scc ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5) ; CHECK: S_ENDPGM 0, implicit $vgpr0 bb.0: - $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) S_BRANCH %bb.1 bb.1: - $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) S_ENDPGM 0, implicit $vgpr0 ... diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir index 906e37e9926..f69c324f948 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir @@ -33,16 +33,16 @@ # SHARE: stack-id: 1, callee-saved-register: '', callee-saved-restored: true, # SHARE: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -# SHARE: SI_SPILL_S32_SAVE $sgpr5, %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (store 4 into %stack.2, addrspace 5) -# SHARE: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) -# SHARE: SI_SPILL_S64_SAVE killed renamable $sgpr6_sgpr7, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (store 8 into %stack.1, align 4, addrspace 5) -# SHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (load 8 from %stack.1, align 4, addrspace 5) +# SHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.2, addrspace 5) +# SHARE: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) +# SHARE: SI_SPILL_S64_SAVE killed renamable $sgpr6_sgpr7, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 8 into %stack.1, align 4, addrspace 5) +# SHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) # SHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit undef $vgpr0 -# SHARE: $sgpr5 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (load 4 from %stack.2, addrspace 5) -# SHARE: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) -# SHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (load 8 from %stack.1, align 4, addrspace 5) +# SHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5) +# SHARE: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) +# SHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) # SHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $vgpr0 -# SHARE: $sgpr5 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (load 4 from %stack.2, addrspace 5) +# SHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5) # NOSHARE: stack: # NOSHARE: - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, @@ -58,17 +58,17 @@ # NOSHARE: stack-id: 1, callee-saved-register: '', callee-saved-restored: true, # NOSHARE: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -# NOSHARE: SI_SPILL_S32_SAVE $sgpr5, %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (store 4 into %stack.2, addrspace 5) -# NOSHARE: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) -# NOSHARE: SI_SPILL_S64_SAVE killed renamable $sgpr6_sgpr7, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (store 8 into %stack.1, align 4, addrspace 5) -# NOSHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (load 8 from %stack.1, align 4, addrspace 5) +# NOSHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.2, addrspace 5) +# NOSHARE: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) +# NOSHARE: SI_SPILL_S64_SAVE killed renamable $sgpr6_sgpr7, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 8 into %stack.1, align 4, addrspace 5) +# NOSHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) # NOSHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit undef $vgpr0 -# NOSHARE: $sgpr5 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (load 4 from %stack.2, addrspace 5) -# NOSHARE: SI_SPILL_S32_SAVE $sgpr5, %stack.3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (store 4 into %stack.3, addrspace 5) -# NOSHARE: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) -# NOSHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (load 8 from %stack.1, align 4, addrspace 5) +# NOSHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5) +# NOSHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.3, addrspace 5) +# NOSHARE: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) +# NOSHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) # NOSHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $vgpr0 -# NOSHARE: $sgpr5 = SI_SPILL_S32_RESTORE %stack.3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5 :: (load 4 from %stack.3, addrspace 5) +# NOSHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.3, addrspace 5) ... @@ -79,23 +79,23 @@ frameInfo: machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 scratchWaveOffsetReg: $sgpr4 - frameOffsetReg: $sgpr5 + frameOffsetReg: $sgpr32 stackPtrOffsetReg: $sgpr32 body: | bb.0: - %0:sreg_32_xm0 = COPY $sgpr5 + %0:sreg_32_xm0 = COPY $sgpr32 %1:vreg_64 = IMPLICIT_DEF %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc - ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr5 + ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32 dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit undef $vgpr0 - $sgpr5 = COPY %0 - %4:sreg_32_xm0 = COPY $sgpr5 - ADJCALLSTACKDOWN 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr5 - ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr5 + $sgpr32 = COPY %0 + %4:sreg_32_xm0 = COPY $sgpr32 + ADJCALLSTACKDOWN 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32 + ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32 $vgpr0 = COPY %2 dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit killed $vgpr0 - $sgpr5 = COPY %4 - ADJCALLSTACKDOWN 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr5 + $sgpr32 = COPY %4 + ADJCALLSTACKDOWN 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32 ... diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index ba0acbc2573..0892bb8549f 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -16,10 +16,10 @@ define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 { ; GCN-LABEL: {{^}}i32_fastcc_i32_i32_stack_object: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 [[K:v[0-9]+]], 9 ; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GCN: s_mov_b32 s5, s32 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:24 +; GCN: buffer_store_dword [[K]], off, s[0:3], s32 offset:24 ; GCN: s_waitcnt vmcnt(0) ; GCN: s_setpc_b64 ; GCN: ; ScratchSize: 68 @@ -40,7 +40,7 @@ entry: ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_stack_object: ; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:24 +; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:24 ; GCN: s_setpc_b64 ; GCN: ; ScratchSize: 68 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 { @@ -54,7 +54,7 @@ entry: ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_callee_stack_object: ; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:24 +; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:24 ; GCN: s_setpc_b64 ; GCN: ; ScratchSize: 136 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 { @@ -84,8 +84,7 @@ entry: ; GCN-LABEL: {{^}}i32_fastcc_i32_byval_i32: ; GCN: s_waitcnt -; GCN-NEXT: s_mov_b32 s5, s32 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s5 offset:4 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1 @@ -116,8 +115,7 @@ entry: ; GCN-NOT: v0 ; GCN-NOT: s32 ; GCN: buffer_load_dword v1, off, s[0:3], s4 offset:16 -; GCN: s_mov_b32 s5, s32 -; GCN: buffer_store_dword v1, off, s[0:3], s5 offset:4 +; GCN: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GCN-NEXT: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [16 x i32] %large) #1 { entry: @@ -127,8 +125,8 @@ entry: ; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s5 offset:4 -; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s5 offset:8 +; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32 offset:4 +; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:8 ; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1 ; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_0]] @@ -150,21 +148,20 @@ define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %l ; FIXME: Why load and store same location for stack args? ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32: -; GCN: s_mov_b32 s5, s32 -; GCN-DAG: buffer_store_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v33, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s5 offset:4 -; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s5 offset:8 +; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32 offset:4 +; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:8 ; GCN-NOT: s32 -; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s5 offset:4 -; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s5 offset:8 +; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32 offset:4 +; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:8 -; GCN-DAG: buffer_load_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Reload -; GCN-DAG: buffer_load_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v33, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN-NOT: s32 ; GCN: s_setpc_b64 @@ -175,12 +172,8 @@ entry: } ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: -; GCN-DAG: s_mov_b32 s5, s32 -; GCN-NOT: s32 ; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:44 - -; GCN-NOT: s32 +; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:44 ; GCN: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 { entry: @@ -217,7 +210,6 @@ entry: ; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill ; GCN-DAG: v_writelane_b32 v34, s33, 0 ; GCN-DAG: v_writelane_b32 v34, s34, 1 -; GCN-DAG: v_writelane_b32 v34, s35, 2 ; GCN-DAG: s_getpc_b64 ; GCN: s_swappc_b64 @@ -228,7 +220,6 @@ entry: ; GCN-DAG: v_readlane_b32 s33, v34, 0 ; GCN-DAG: v_readlane_b32 s34, v34, 1 -; GCN-DAG: v_readlane_b32 s35, v34, 2 ; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:4 ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 @@ -249,8 +240,12 @@ entry: ; in same place at function exit. ; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32: -; GCN: s_mov_b32 s5, s32 -; GCN-NOT: s32 +; GCN-NOT: s33 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset: + +; GCN-NOT: s33 + +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset: ; GCN: s_setpc_b64 s[6:7] define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { entry: @@ -262,8 +257,10 @@ entry: } ; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area: -; GCN: s_mov_b32 s5, s32 -; GCN-NOT: s32 +; GCN-NOT: s33 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:48 + +; GCN-NOT: s33 ; GCN: s_setpc_b64 s[6:7] define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/sp-too-many-input-sgprs.ll b/llvm/test/CodeGen/AMDGPU/sp-too-many-input-sgprs.ll new file mode 100644 index 00000000000..e1f6eb715a3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sp-too-many-input-sgprs.ll @@ -0,0 +1,102 @@ +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefixes=MESA3D,ALL %s +; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=UNKNOWN,ALL %s + +; Make sure shaders pick a workable SP with > 32 input SGPRs. +; FIXME: Doesn't seem to be getting initial value from right register? + +; ALL-LABEL: {{^}}too_many_input_sgprs_32: +; MESA3D-NOT: s34 +; MESA3D: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s34 offset:4 + +; Happens to end up in s32 anyway +; UNKNOWN-NOT: s32 +; UNKNOWN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:4 +define amdgpu_ps i32 @too_many_input_sgprs_32(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 inreg %arg7, + i32 inreg %arg8, i32 inreg %arg9, i32 inreg %arg10, i32 inreg %arg11, i32 inreg %arg12, i32 inreg %arg13, i32 inreg %arg14, i32 inreg %arg15, + i32 inreg %arg16, i32 inreg %arg17, i32 inreg %arg18, i32 inreg %arg19, i32 inreg %arg20, i32 inreg %arg21, i32 inreg %arg22, i32 inreg %arg23, + i32 inreg %arg24, i32 inreg %arg25, i32 inreg %arg26, i32 inreg %arg27, i32 inreg %arg28, i32 inreg %arg29, i32 inreg %arg30, i32 inreg %arg31) { +bb: + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + %tmp = add i32 %arg, %arg1 + %tmp32 = add i32 %tmp, %arg2 + %tmp33 = add i32 %tmp32, %arg3 + %tmp34 = add i32 %tmp33, %arg4 + %tmp35 = add i32 %tmp34, %arg5 + %tmp36 = add i32 %tmp35, %arg6 + %tmp37 = add i32 %tmp36, %arg7 + %tmp38 = add i32 %tmp37, %arg8 + %tmp39 = add i32 %tmp38, %arg9 + %tmp40 = add i32 %tmp39, %arg10 + %tmp41 = add i32 %tmp40, %arg11 + %tmp42 = add i32 %tmp41, %arg12 + %tmp43 = add i32 %tmp42, %arg13 + %tmp44 = add i32 %tmp43, %arg14 + %tmp45 = add i32 %tmp44, %arg15 + %tmp46 = add i32 %tmp45, %arg16 + %tmp47 = add i32 %tmp46, %arg17 + %tmp48 = add i32 %tmp47, %arg18 + %tmp49 = add i32 %tmp48, %arg19 + %tmp50 = add i32 %tmp49, %arg20 + %tmp51 = add i32 %tmp50, %arg21 + %tmp52 = add i32 %tmp51, %arg22 + %tmp53 = add i32 %tmp52, %arg23 + %tmp54 = add i32 %tmp53, %arg24 + %tmp55 = add i32 %tmp54, %arg25 + %tmp56 = add i32 %tmp55, %arg26 + %tmp57 = add i32 %tmp56, %arg27 + %tmp58 = add i32 %tmp57, %arg28 + %tmp59 = add i32 %tmp58, %arg29 + %tmp60 = add i32 %tmp59, %arg30 + %tmp61 = add i32 %tmp60, %arg31 + ret i32 %tmp61 +} + +; ALL-LABEL: {{^}}too_many_input_sgprs_33: +; MESA3D-NOT: s35 +; MESA3D: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s35 offset:4 + +; UNKNOWN-NOT: s33 +; UNKNOWN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s33 offset:4 +define amdgpu_ps i32 @too_many_input_sgprs_33(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 inreg %arg7, + i32 inreg %arg8, i32 inreg %arg9, i32 inreg %arg10, i32 inreg %arg11, i32 inreg %arg12, i32 inreg %arg13, i32 inreg %arg14, i32 inreg %arg15, + i32 inreg %arg16, i32 inreg %arg17, i32 inreg %arg18, i32 inreg %arg19, i32 inreg %arg20, i32 inreg %arg21, i32 inreg %arg22, i32 inreg %arg23, + i32 inreg %arg24, i32 inreg %arg25, i32 inreg %arg26, i32 inreg %arg27, i32 inreg %arg28, i32 inreg %arg29, i32 inreg %arg30, i32 inreg %arg31, + i32 inreg %arg32) { +bb: + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %alloca + %tmp = add i32 %arg, %arg1 + %tmp32 = add i32 %tmp, %arg2 + %tmp33 = add i32 %tmp32, %arg3 + %tmp34 = add i32 %tmp33, %arg4 + %tmp35 = add i32 %tmp34, %arg5 + %tmp36 = add i32 %tmp35, %arg6 + %tmp37 = add i32 %tmp36, %arg7 + %tmp38 = add i32 %tmp37, %arg8 + %tmp39 = add i32 %tmp38, %arg9 + %tmp40 = add i32 %tmp39, %arg10 + %tmp41 = add i32 %tmp40, %arg11 + %tmp42 = add i32 %tmp41, %arg12 + %tmp43 = add i32 %tmp42, %arg13 + %tmp44 = add i32 %tmp43, %arg14 + %tmp45 = add i32 %tmp44, %arg15 + %tmp46 = add i32 %tmp45, %arg16 + %tmp47 = add i32 %tmp46, %arg17 + %tmp48 = add i32 %tmp47, %arg18 + %tmp49 = add i32 %tmp48, %arg19 + %tmp50 = add i32 %tmp49, %arg20 + %tmp51 = add i32 %tmp50, %arg21 + %tmp52 = add i32 %tmp51, %arg22 + %tmp53 = add i32 %tmp52, %arg23 + %tmp54 = add i32 %tmp53, %arg24 + %tmp55 = add i32 %tmp54, %arg25 + %tmp56 = add i32 %tmp55, %arg26 + %tmp57 = add i32 %tmp56, %arg27 + %tmp58 = add i32 %tmp57, %arg28 + %tmp59 = add i32 %tmp58, %arg29 + %tmp60 = add i32 %tmp59, %arg30 + %tmp61 = add i32 %tmp60, %arg31 + %tmp62 = add i32 %tmp61, %arg32 + ret i32 %tmp62 +} diff --git a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir index cb804bafb54..0cf19cea781 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir @@ -8,12 +8,12 @@ # CHECK-LABEL: name: expecting_non_empty_interval # CHECK: undef %7.sub1:vreg_64 = V_MAC_F32_e32 0, undef %1:vgpr_32, undef %7.sub1, implicit $exec -# CHECK-NEXT: SI_SPILL_V64_SAVE %7, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5) +# CHECK-NEXT: SI_SPILL_V64_SAVE %7, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5) # CHECK-NEXT: undef %5.sub1:vreg_64 = V_MOV_B32_e32 1786773504, implicit $exec # CHECK-NEXT: dead %3:vgpr_32 = V_MUL_F32_e32 0, %5.sub1, implicit $exec # CHECK: S_NOP 0, implicit %6.sub1 -# CHECK-NEXT: %8:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5) +# CHECK-NEXT: %8:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5) # CHECK-NEXT: S_NOP 0, implicit %8.sub1 # CHECK-NEXT: S_NOP 0, implicit undef %9.sub0 @@ -22,7 +22,6 @@ tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 scratchWaveOffsetReg: $sgpr4 - frameOffsetReg: $sgpr5 stackPtrOffsetReg: $sgpr32 body: | bb.0: @@ -57,7 +56,6 @@ tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 scratchWaveOffsetReg: $sgpr4 - frameOffsetReg: $sgpr5 stackPtrOffsetReg: $sgpr32 body: | bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll index 57722e202bc..972d4e6692b 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll @@ -171,7 +171,7 @@ entry: %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 ; 0x40000 / 64 = 4096 (for wave64) - ; CHECK: s_add_u32 s6, s5, 0x40000 + ; CHECK: s_add_u32 s6, s32, 0x40000 ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill %a = load volatile i32, i32 addrspace(5)* %aptr @@ -223,7 +223,7 @@ entry: %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* ; 0x3ff00 / 64 = 4092 (for wave64) - ; CHECK: s_add_u32 s6, s5, 0x3ff00 + ; CHECK: s_add_u32 s6, s32, 0x3ff00 ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 offset:4 ; 4-byte Folded Spill %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll index aece86d9a31..91453d73c9e 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -9,7 +9,10 @@ ; = 144 bytes with padding between them ; GCN-LABEL: {{^}}needs_align16_default_stack_align: -; GCN: s_mov_b32 s5, s32 +; GCN: s_sub_u32 [[SUB:s[0-9]+]], s32, s4 +; GCN-NEXT: v_lshrrev_b32_e64 [[FRAMEDIFF:v[0-9]+]], 6, [[SUB]] +; GCN: v_add_u32_e64 [[FI:v[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 16, [[FRAMEDIFF]] + ; GCN-NOT: s32 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen @@ -135,9 +138,7 @@ define void @default_realign_align128(i32 %idx) #0 { ; GCN-LABEL: {{^}}disable_realign_align128: ; GCN-NOT: s32 -; GCN: s_mov_b32 s5, s32 -; GCN-NOT: s32 -; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:16 +; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:16 ; GCN-NOT: s32 define void @disable_realign_align128(i32 %idx) #3 { %alloca.align = alloca i32, align 128, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir b/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir index dc02327395d..11cf52ba3e5 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir +++ b/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir @@ -9,11 +9,11 @@ # CHECK: - { id: 1, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, # CHECK-NEXT: stack-id: 1, -# CHECK: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) -# CHECK: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) +# CHECK: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) +# CHECK: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) -# CHECK: SI_SPILL_S32_SAVE killed renamable $sgpr6, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5, implicit-def dead $m0 :: (store 4 into %stack.1, addrspace 5) -# CHECK: $sgpr6 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr5, implicit-def dead $m0 :: (load 4 from %stack.1, addrspace 5) +# CHECK: SI_SPILL_S32_SAVE killed renamable $sgpr6, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32, implicit-def dead $m0 :: (store 4 into %stack.1, addrspace 5) +# CHECK: $sgpr6 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32, implicit-def dead $m0 :: (load 4 from %stack.1, addrspace 5) name: no_merge_sgpr_vgpr_spill_slot tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/store-hi16.ll b/llvm/test/CodeGen/AMDGPU/store-hi16.ll index dbb09217dac..2f9d0616fa6 100644 --- a/llvm/test/CodeGen/AMDGPU/store-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/store-hi16.ll @@ -481,10 +481,10 @@ entry: ; GCN-LABEL: {{^}}store_private_hi_v2i16_max_offset: ; GCN: s_waitcnt -; GFX900: buffer_store_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}} +; GFX900: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0 -; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s5 offset:4094{{$}} +; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s32 offset:4094{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -635,7 +635,7 @@ entry: ; GCN-LABEL: {{^}}store_private_hi_v2i16_to_offset: ; GCN: s_waitcnt ; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s5 offset:4094 +; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094 define void @store_private_hi_v2i16_to_offset(i32 %arg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -652,7 +652,7 @@ entry: ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_to_offset: ; GCN: s_waitcnt ; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s5 offset:4095 +; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s32 offset:4095 define void @store_private_hi_v2i16_i8_to_offset(i32 %arg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir b/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir index cc864bc02ec..f4932d6987c 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir +++ b/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir @@ -42,7 +42,6 @@ tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 scratchWaveOffsetReg: $sgpr4 - frameOffsetReg: $sgpr5 stackPtrOffsetReg: $sgpr32 liveins: - { reg: '$vgpr2', virtual-reg: '%0' } @@ -112,7 +111,7 @@ body: | ; and inserting a spill. Here we just check that the point where the error ; occurs we see a correctly generated spill. ; GCN-LABEL: bb.7: - ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec + ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec undef %15.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec %15.sub1:vreg_128 = COPY %15.sub0 @@ -128,7 +127,7 @@ body: | successors: %bb.12(0x80000000) ; GCN-LABEL: bb.9: - ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec + ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec undef %15.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec %15.sub1:vreg_128 = COPY %15.sub0 @@ -139,7 +138,7 @@ body: | successors: %bb.12(0x80000000) ; GCN-LABEL: bb.10: - ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, implicit $exec + ; GCN: SI_SPILL_V128_SAVE %{{[0-9]+}}, %stack.1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec undef %15.sub0:vreg_128 = V_MOV_B32_e32 2143289344, implicit $exec %15.sub1:vreg_128 = COPY %15.sub0 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll index 01bcd6fd84b..8ff09bbd237 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -18,7 +18,7 @@ ; CHECK-NEXT: scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' ; CHECK-NEXT: scratchWaveOffsetReg: '$sgpr101' ; CHECK-NEXT: frameOffsetReg: '$sgpr101' -; CHECK-NEXT: stackPtrOffsetReg: '$sp_reg' +; CHECK-NEXT: stackPtrOffsetReg: '$sgpr101' ; CHECK-NEXT: body: define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) { %gep = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %arg0 @@ -38,7 +38,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) { ; CHECK-NEXT: scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' ; CHECK-NEXT: scratchWaveOffsetReg: '$sgpr101' ; CHECK-NEXT: frameOffsetReg: '$sgpr101' -; CHECK-NEXT: stackPtrOffsetReg: '$sp_reg' +; CHECK-NEXT: stackPtrOffsetReg: '$sgpr101' ; CHECK-NEXT: body: define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) { ret void |