| author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-07-13 16:40:25 +0000 |
|---|---|---|
| committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-07-13 16:40:25 +0000 |
| commit | de950777804d0fb9ce42190e1fe28e28ec033c2b | |
| tree | 4a122ffa5261339ce2e6fa471542c9a38fb76996 /llvm/test/CodeGen/AMDGPU | |
| parent | 218b6a2a2ae682f0e2b42a9f76619a382b723430 | |
AMDGPU: Fix handling of alignment padding in DAG argument lowering
This was completely broken whenever a struct argument was present,
because the alignment-padding information is thrown away during
argument analysis.
The offsets passed in to LowerFormalArguments are not useful: they
partially depend on the legalized result register type, and they do
not account for the alignment in the first place.
Ignore the Ins array and instead work out what is needed from the raw
IR type. This seems to fix the padding computation when the DAG
lowering is forced (and stops breaking arguments that follow padded
arguments when the arguments were only partially lowered in the IR).
llvm-svn: 337021
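
The gist of the change, for readers of the diff below: each kernarg offset is derived from the argument's raw IR type via the DataLayout, instead of from the per-register offsets in the Ins array. The following is a minimal C++ sketch of that computation, not the actual patch; the function name and structure are illustrative only:

    // Sketch: assign in-memory offsets to kernel arguments from their IR
    // types, inserting alignment padding before under-aligned offsets
    // (e.g. before an 8-byte-aligned struct that follows an i8).
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Function.h"
    #include "llvm/Support/MathExtras.h"
    using namespace llvm;

    static uint64_t assignKernargOffsets(const Function &F,
                                         const DataLayout &DL) {
      uint64_t Offset = 0;
      for (const Argument &Arg : F.args()) {
        Type *Ty = Arg.getType();
        uint64_t Align = DL.getABITypeAlignment(Ty); // e.g. 8 for {i32, i64}
        uint64_t Size = DL.getTypeAllocSize(Ty);     // includes tail padding
        Offset = alignTo(Offset, Align); // insert the alignment padding
        // Offset is now where this argument's kernarg loads begin.
        Offset += Size;
      }
      return Offset; // explicit kernarg segment size, before any rounding
    }

For @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) in the tests below, this walk places %arg1 at alignTo(17, 8) = 24, which is the 0x18 offset the s_load_dword checks expect.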
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
4 files changed, 206 insertions, 21 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index 5c2c868476b..9492b710d13 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -589,6 +589,17 @@ entry:
 ;   ret void
 ; }
 
+; FUNC-LABEL: {{^}}i65_arg:
+; HSA-VI: kernarg_segment_byte_size = 24
+; HSA-VI: kernarg_segment_alignment = 4
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind {
+entry:
+  store i65 %in, i65 addrspace(1)* %out, align 4
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}i1_arg:
 ; HSA-VI: kernarg_segment_byte_size = 12
 ; HSA-VI: kernarg_segment_alignment = 4
@@ -651,7 +662,7 @@ define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwi
 }
 
 ; FUNC-LABEL: {{^}}empty_struct_arg:
-; HSA: kernarg_segment_byte_size = 0
+; HSA-VI: kernarg_segment_byte_size = 0
 define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
   ret void
 }
@@ -667,11 +678,11 @@ define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
 
 ; FIXME: Total argument size is computed wrong
 ; FUNC-LABEL: {{^}}struct_argument_alignment:
-; HSA: kernarg_segment_byte_size = 40
-; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
-; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
-; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
+; HSA-VI: kernarg_segment_byte_size = 40
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
 define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
   %val0 = extractvalue {i32, i64} %arg0, 0
   %val1 = extractvalue {i32, i64} %arg0, 1
@@ -687,11 +698,11 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
 ; No padding between i8 and next struct, but round up at end to 4 byte
 ; multiple.
 ; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
-; HSA: kernarg_segment_byte_size = 28
-; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
-; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
-; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
+; HSA-VI: kernarg_segment_byte_size = 28
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
   %val0 = extractvalue <{i32, i64}> %arg0, 0
   %val1 = extractvalue <{i32, i64}> %arg0, 1
@@ -703,3 +714,47 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
   store volatile i64 %val3, i64 addrspace(1)* null
   ret void
 }
+
+; GCN-LABEL: {{^}}struct_argument_alignment_after:
+; HSA-VI: kernarg_segment_byte_size = 64
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
+; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
+define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
+  %val0 = extractvalue {i32, i64} %arg0, 0
+  %val1 = extractvalue {i32, i64} %arg0, 1
+  %val2 = extractvalue {i32, i64} %arg2, 0
+  %val3 = extractvalue {i32, i64} %arg2, 1
+  store volatile i32 %val0, i32 addrspace(1)* null
+  store volatile i64 %val1, i64 addrspace(1)* null
+  store volatile i32 %val2, i32 addrspace(1)* null
+  store volatile i64 %val3, i64 addrspace(1)* null
+  store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null
+  ret void
+}
+
+; GCN-LABEL: {{^}}array_3xi32:
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
+define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
+  store volatile i16 %arg0, i16 addrspace(1)* undef
+  store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef
+  ret void
+}
+
+; FIXME: Why not all scalar loads?
+; GCN-LABEL: {{^}}array_3xi16:
+; HSA-VI: s_add_u32 s{{[0-9]+}}, s4, 2
+; HSA-VI: s_addc_u32 s{{[0-9]+}}, s5, 0
+; HSA-VI: flat_load_ushort
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
+define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
+  store volatile i8 %arg0, i8 addrspace(1)* undef
+  store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
new file mode 100644
index 00000000000..a1bb6c28e74
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
@@ -0,0 +1,132 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-ir-lower-kernel-arguments=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
+
+; Repeat of some problematic tests in kernel-args.ll, with the IR
+; argument lowering pass disabled. Struct padding needs to be
+; accounted for, as well as legalization of types changing offsets.
+
+; FUNC-LABEL: {{^}}i1_arg:
+; HSA-VI: kernarg_segment_byte_size = 12
+; HSA-VI: kernarg_segment_alignment = 4
+
+; GCN: s_load_dword s
+; GCN: s_and_b32
+define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
+  store i1 %x, i1 addrspace(1)* %out, align 1
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v3i8_arg:
+; HSA-VI: kernarg_segment_byte_size = 12
+; HSA-VI: kernarg_segment_alignment = 4
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
+define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
+entry:
+  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}i65_arg:
+; HSA-VI: kernarg_segment_byte_size = 24
+; HSA-VI: kernarg_segment_alignment = 4
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind {
+entry:
+  store i65 %in, i65 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}empty_struct_arg:
+; HSA-VI: kernarg_segment_byte_size = 0
+define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
+  ret void
+}
+
+; The correct load offsets for these:
+; load 4 from 0,
+; load 8 from 8
+; load 4 from 24
+; load 8 from 32
+
+; With the SelectionDAG argument lowering, the alignments for the
+; struct members is not properly considered, making these wrong.
+
+; FIXME: Total argument size is computed wrong
+; FUNC-LABEL: {{^}}struct_argument_alignment:
+; HSA-VI: kernarg_segment_byte_size = 40
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
+define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
+  %val0 = extractvalue {i32, i64} %arg0, 0
+  %val1 = extractvalue {i32, i64} %arg0, 1
+  %val2 = extractvalue {i32, i64} %arg1, 0
+  %val3 = extractvalue {i32, i64} %arg1, 1
+  store volatile i32 %val0, i32 addrspace(1)* null
+  store volatile i64 %val1, i64 addrspace(1)* null
+  store volatile i32 %val2, i32 addrspace(1)* null
+  store volatile i64 %val3, i64 addrspace(1)* null
+  ret void
+}
+
+; No padding between i8 and next struct, but round up at end to 4 byte
+; multiple.
+; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
+; HSA-VI: kernarg_segment_byte_size = 28
+; HSA-VI: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
+; HSA-VI: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
+define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
+  %val0 = extractvalue <{i32, i64}> %arg0, 0
+  %val1 = extractvalue <{i32, i64}> %arg0, 1
+  %val2 = extractvalue <{i32, i64}> %arg1, 0
+  %val3 = extractvalue <{i32, i64}> %arg1, 1
+  store volatile i32 %val0, i32 addrspace(1)* null
+  store volatile i64 %val1, i64 addrspace(1)* null
+  store volatile i32 %val2, i32 addrspace(1)* null
+  store volatile i64 %val3, i64 addrspace(1)* null
+  ret void
+}
+
+; GCN-LABEL: {{^}}struct_argument_alignment_after:
+; HSA-VI: kernarg_segment_byte_size = 64
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
+; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
+define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
+  %val0 = extractvalue {i32, i64} %arg0, 0
+  %val1 = extractvalue {i32, i64} %arg0, 1
+  %val2 = extractvalue {i32, i64} %arg2, 0
+  %val3 = extractvalue {i32, i64} %arg2, 1
+  store volatile i32 %val0, i32 addrspace(1)* null
+  store volatile i64 %val1, i64 addrspace(1)* null
+  store volatile i32 %val2, i32 addrspace(1)* null
+  store volatile i64 %val3, i64 addrspace(1)* null
+  store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null
+  ret void
+}
+
+; GCN-LABEL: {{^}}array_3xi32:
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
+define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
+  store volatile i16 %arg0, i16 addrspace(1)* undef
+  store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}array_3xi16:
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
+define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
+  store volatile i8 %arg0, i8 addrspace(1)* undef
+  store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
index f860a122a88..6a9191e7dcb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
@@ -33,7 +33,7 @@ define amdgpu_kernel void @opencl_kernel_implicitarg_ptr_empty() #1 {
 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
 ; HSA: kernarg_segment_byte_size = 112
-; MESA: kernarg_segment_byte_size = 464
+; MESA: kernarg_segment_byte_size = 128
 
 ; HSA: s_load_dword s0, s[4:5], 0x1c
 define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 {
@@ -47,7 +47,7 @@ define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 {
 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
 ; HSA: kernarg_segment_byte_size = 160
-; MESA: kernarg_segment_byte_size = 464
+; MESA: kernarg_segment_byte_size = 128
 
 ; HSA: s_load_dword s0, s[4:5], 0x1c
 define amdgpu_kernel void @opencl_kernel_implicitarg_ptr([112 x i8]) #1 {
@@ -118,10 +118,10 @@ define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func_empty() #1 {
 ; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func:
 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
 ; HSA: kernarg_segment_byte_size = 112
-; MESA: kernarg_segment_byte_size = 464
+; MESA: kernarg_segment_byte_size = 128
 
 ; HSA: s_add_u32 s6, s4, 0x70
-; MESA: s_add_u32 s6, s4, 0x1c0
+; MESA: s_add_u32 s6, s4, 0x70
 ; GCN: s_addc_u32 s7, s5, 0{{$}}
 ; GCN: s_swappc_b64
@@ -133,10 +133,9 @@ define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 {
 ; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func:
 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
 ; HSA: kernarg_segment_byte_size = 160
-; MESA: kernarg_segment_byte_size = 464
+; MESA: kernarg_segment_byte_size = 128
 
-; HSA: s_add_u32 s6, s4, 0x70
-; MESA: s_add_u32 s6, s4, 0x1c0
+; GCN: s_add_u32 s6, s4, 0x70
 ; GCN: s_addc_u32 s7, s5, 0{{$}}
 ; GCN: s_swappc_b64
@@ -219,8 +218,7 @@ define void @opencl_func_kernarg_implicitarg_ptr() #0 {
 ; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func:
 ; GCN: s_mov_b64 s[6:7], s[4:5]
-; HSA: s_add_u32 s8, s6, 0x70
-; MESA: s_add_u32 s8, s6, 0x1c0
+; GCN: s_add_u32 s8, s6, 0x70
 ; GCN: s_addc_u32 s9, s7, 0
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
index 6c1bc9eaa76..5853d8d8e4e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -79,7 +79,7 @@ define amdgpu_kernel void @opencl_test_implicit_alignment(i32 addrspace(1)* %out
 ; CO-V2: enable_sgpr_kernarg_segment_ptr = 1
 ; HSA: kernarg_segment_byte_size = 0
 ; OS-MESA3D: kernarg_segment_byte_size = 16
-; CO-V2: kernarg_segment_alignment = 32
+; CO-V2: kernarg_segment_alignment = 4
 
 ; HSA: s_load_dword s{{[0-9]+}}, s[4:5]
 define amdgpu_kernel void @test_no_kernargs() #1 {
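
The new file's checks can be reproduced by invoking its RUN line by hand (a sketch; this assumes llc and FileCheck from an LLVM build are on PATH and the command runs from the source tree root):

    llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-ir-lower-kernel-arguments=0 \
        -verify-machineinstrs < llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll \
      | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC \
        llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll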

