AMDGPU: Increase default stack alignment

8 and 16-byte values are common, so increase the default alignment to avoid realigning the stack in most functions.

llvm-svn: 328821
author: Matt Arsenault <Matthew.Arsenault@amd.com> 2018-03-29 20:22:04 +0000
committer: Matt Arsenault <Matthew.Arsenault@amd.com> 2018-03-29 20:22:04 +0000
commit: ffb132e74b26213ed3df4378c0de3128d9060aad (patch)
tree: ae6994b2c3087b8bcab722f56aa503849fd791fc
parent: d9911f6f7baca75a9d51352bf9ef3718ee05cd82 (diff)
download: bcm5719-llvm-ffb132e74b26213ed3df4378c0de3128d9060aad.tar.gz
bcm5719-llvm-ffb132e74b26213ed3df4378c0de3128d9060aad.zip
9 files changed, 24 insertions, 19 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 6312bb02feb..a7e52cb16c9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -550,8 +550,13 @@ public:
   // Scratch is allocated in 256 dword per wave blocks for the entire
   // wavefront. When viewed from the perspecive of an arbitrary workitem, this
   // is 4-byte aligned.
+  //
+  // Only 4-byte alignment is really needed to access anything. Transformations
+  // on the pointer value itself may rely on the alignment / known low bits of
+  // the pointer. Set this to something above the minimum to avoid needing
+  // dynamic realignment in common cases.
   unsigned getStackAlignment() const {
-    return 4;
+    return 16;
   }
 
   bool enableMachineScheduler() const override {
diff --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
index b56ec379bf1..0b34524d156 100644
--- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
@@ -34,7 +34,7 @@ entry:
 ; GCN-DAG: buffer_store_dword v33
 ; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
 ; GCN-DAG: v_writelane_b32
-; GCN-DAG: s_add_u32 s32, s32, 0xb00{{$}}
+; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}}
 ; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
 ; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]]
 ; GCN-DAG: buffer_store_dword [[ADD0]], off, s[0:3], s5 offset:4{{$}}
@@ -50,7 +50,7 @@ entry:
 ; GCN-NOT: v_readlane_b32 s32
 ; GCN: buffer_load_dword v32,
 ; GCN: buffer_load_dword v33,
-; GCN: s_sub_u32 s32, s32, 0xb00{{$}}
+; GCN: s_sub_u32 s32, s32, 0xc00{{$}}
 ; GCN: s_setpc_b64
 define void  @void_func_byval_struct_non_leaf(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 {
 entry:
@@ -130,7 +130,7 @@ entry:
 
 ; GCN-LABEL: {{^}}call_void_func_byval_struct_kernel:
 ; GCN: s_mov_b32 s33, s7
-; GCN: s_add_u32 s32, s33, 0xa00{{$}}
+; GCN: s_add_u32 s32, s33, 0xc00{{$}}
 
 ; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
 ; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 2d3e35a6d28..b0998355395 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -475,8 +475,8 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
 }
 
 ; GCN-LABEL: {{^}}test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
-; MESA-DAG: s_add_u32 [[SP:s[0-9]+]], [[FP_REG:s[0-9]+]], 0x600{{$}}
-; HSA-DAG: s_add_u32 [[SP:s[0-9]+]], [[FP_REG:s[0-9]+]], 0x600{{$}}
+; MESA-DAG: s_add_u32 [[SP:s[0-9]+]], [[FP_REG:s[0-9]+]], 0x800{{$}}
+; HSA-DAG: s_add_u32 [[SP:s[0-9]+]], [[FP_REG:s[0-9]+]], 0x800{{$}}
 
 ; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3
 ; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index f380bf5ccea..21c69d9bee7 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -146,7 +146,7 @@ define void @use_stack1() #1 {
 }
 
 ; GCN-LABEL: {{^}}indirect_use_stack:
-; GCN: ScratchSize: 2124
+; GCN: ScratchSize: 2132
 define void @indirect_use_stack() #1 {
   %alloca = alloca [16 x i32], align 4, addrspace(5)
   call void asm sideeffect "; use $0", "v"([16 x i32] addrspace(5)* %alloca) #0
@@ -156,7 +156,7 @@ define void @indirect_use_stack() #1 {
 
 ; GCN-LABEL: {{^}}indirect_2_level_use_stack:
 ; GCN: is_dynamic_callstack = 0
-; GCN: ScratchSize: 2124
+; GCN: ScratchSize: 2132
 define amdgpu_kernel void @indirect_2_level_use_stack() #0 {
   call void @indirect_use_stack()
   ret void
@@ -199,7 +199,7 @@ define amdgpu_kernel void @usage_external_recurse() #0 {
 }
 
 ; GCN-LABEL: {{^}}direct_recursion_use_stack:
-; GCN: ScratchSize: 2056
+; GCN: ScratchSize: 2064
 define void @direct_recursion_use_stack(i32 %val) #2 {
   %alloca = alloca [512 x i32], align 4, addrspace(5)
   call void asm sideeffect "; use $0", "v"([512 x i32] addrspace(5)* %alloca) #0
@@ -218,7 +218,7 @@ ret:
 ; GCN-LABEL: {{^}}usage_direct_recursion:
 ; GCN: is_ptr64 = 1
 ; GCN: is_dynamic_callstack = 1
-; GCN: workitem_private_segment_byte_size = 2056
+; GCN: workitem_private_segment_byte_size = 2064
 define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
   call void @direct_recursion_use_stack(i32 %n)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
index 0d8244ea1f7..daec4930e67 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -42,7 +42,7 @@ define void @callee_with_stack() #0 {
 ; GCN-DAG: v_writelane_b32 v32, s33,
 ; GCN-DAG: v_writelane_b32 v32, s34,
 ; GCN-DAG: v_writelane_b32 v32, s35,
-; GCN-DAG: s_add_u32 s32, s32, 0x300{{$}}
+; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}}
 ; GCN-DAG: v_mov_b32_e32 v0, 0{{$}}
 ; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
 ; GCN-DAG: s_mov_b32 s33, s5
@@ -82,7 +82,7 @@ define void @callee_with_stack_and_call() #0 {
 ; GCN-DAG: v_readlane_b32 s34, v32, 1
 ; GCN-DAG: v_readlane_b32 s33, v32, 0
 ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
-; GCN: s_sub_u32 s32, s32, 0x200
+; GCN: s_sub_u32 s32, s32, 0x400
 
 ; GCN: s_setpc_b64
 define void @callee_no_stack_with_call() #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
index 74dee0aea0f..d797f59054d 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
@@ -558,7 +558,7 @@ define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 {
 
 ; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill:
 ; GCN: s_mov_b32 s5, s32
-; GCN: s_add_u32 s32, s32, 0x300
+; GCN: s_add_u32 s32, s32, 0x400
 
 ; GCN-DAG: s_mov_b32 [[SAVE_X:s[0-57-9][0-9]*]], s14
 ; GCN-DAG: s_mov_b32 [[SAVE_Y:s[0-68-9][0-9]*]], s15
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index 25c40dd0ada..432a4d3c985 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -423,7 +423,7 @@ define void @too_many_args_use_workitem_id_x_byval(
 ; GCN: enable_vgpr_workitem_id = 0
 
 ; GCN: s_mov_b32 s33, s7
-; GCN: s_add_u32 s32, s33, 0x200{{$}}
+; GCN: s_add_u32 s32, s33, 0x400{{$}}
 
 ; GCN-NOT: s32
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
index c1105347cd3..462274c65e7 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll
@@ -12,7 +12,7 @@ declare void @external_void_func_i32(i32) #0
 ; GCN: s_mov_b32 s5, s32
 ; Spill CSR VGPR used for SGPR spilling
 ; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4
-; GCN-DAG: s_add_u32 s32, s32, 0x200
+; GCN-DAG: s_add_u32 s32, s32, 0x400
 ; GCN-DAG: v_writelane_b32 v32, s33, 0
 ; GCN-DAG: v_writelane_b32 v32, s34, 1
 ; GCN-DAG: v_writelane_b32 v32, s35, 2
@@ -23,7 +23,7 @@ declare void @external_void_func_i32(i32) #0
 ; GCN: v_readlane_b32 s34, v32, 1
 ; GCN: v_readlane_b32 s33, v32, 0
 ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
-; GCN: s_sub_u32 s32, s32, 0x200
+; GCN: s_sub_u32 s32, s32, 0x400
 ; GCN: s_setpc_b64
 define void @test_func_call_external_void_func_i32_imm() #0 {
   call void @external_void_func_i32(i32 42)
@@ -33,10 +33,10 @@ define void @test_func_call_external_void_func_i32_imm() #0 {
 ; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm_stack_use:
 ; GCN: s_waitcnt
 ; GCN: s_mov_b32 s5, s32
-; GCN: s_add_u32 s32, s32, 0x1200{{$}}
+; GCN: s_add_u32 s32, s32, 0x1400{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset
 ; GCN: s_swappc_b64
-; GCN: s_sub_u32 s32, s32, 0x1200{{$}}
+; GCN: s_sub_u32 s32, s32, 0x1400{{$}}
 ; GCN: s_setpc_b64
 define void @test_func_call_external_void_func_i32_imm_stack_use() #0 {
   %alloca = alloca [16 x i32], align 4, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll
index 2db95cd820f..afe05cd79b2 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll
@@ -80,7 +80,7 @@ declare i32 @foo(i32 addrspace(5)*) #0
 ; ASM: buffer_store_dword
 ; ASM: buffer_store_dword
 ; ASM: s_swappc_b64
-; ASM: ScratchSize: 16396
+; ASM: ScratchSize: 16400
 define amdgpu_kernel void @call_private(i32 addrspace(1)* %out, i32 %in) #0 {
 entry:
   %tmp = alloca [2 x i32], addrspace(5)
author	Matt Arsenault <Matthew.Arsenault@amd.com>	2018-03-29 20:22:04 +0000
committer	Matt Arsenault <Matthew.Arsenault@amd.com>	2018-03-29 20:22:04 +0000
commit	ffb132e74b26213ed3df4378c0de3128d9060aad (patch)
tree	ae6994b2c3087b8bcab722f56aa503849fd791fc
parent	d9911f6f7baca75a9d51352bf9ef3718ee05cd82 (diff)
download	bcm5719-llvm-ffb132e74b26213ed3df4378c0de3128d9060aad.tar.gz bcm5719-llvm-ffb132e74b26213ed3df4378c0de3128d9060aad.zip