| author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-07-13 16:40:25 +0000 |
|---|---|---|
| committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-07-13 16:40:25 +0000 |
| commit | de950777804d0fb9ce42190e1fe28e28ec033c2b | |
| tree | 4a122ffa5261339ce2e6fa471542c9a38fb76996 /llvm/test/CodeGen/AMDGPU | |
| parent | 218b6a2a2ae682f0e2b42a9f76619a382b723430 | |
AMDGPU: Fix handling of alignment padding in DAG argument lowering
This was completely broken whenever a struct argument was present,
because the alignment-padding information is thrown away during
argument analysis.
The offsets passed in to LowerFormalArguments are not useful: they
partially depend on the legalized result register type, and they do
not account for the alignment in the first place.
Ignore the Ins array and instead work out what is needed from the raw
IR type. This seems to fix the padding computation when the DAG
lowering is forced (and stops breaking arguments that follow padded
arguments when the arguments were only partially lowered in the IR).
llvm-svn: 337021
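
The gist of the change, for readers of the diff below: each kernarg offset is derived from the argument's raw IR type via the DataLayout, instead of from the per-register offsets in the Ins array. The following is a minimal C++ sketch of that computation, not the actual patch; the function name and structure are illustrative only:

    // Sketch: assign in-memory offsets to kernel arguments from their IR
    // types, inserting alignment padding before under-aligned offsets
    // (e.g. before an 8-byte-aligned struct that follows an i8).
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Function.h"
    #include "llvm/Support/MathExtras.h"
    using namespace llvm;

    static uint64_t assignKernargOffsets(const Function &F,
                                         const DataLayout &DL) {
      uint64_t Offset = 0;
      for (const Argument &Arg : F.args()) {
        Type *Ty = Arg.getType();
        uint64_t Align = DL.getABITypeAlignment(Ty); // e.g. 8 for {i32, i64}
        uint64_t Size = DL.getTypeAllocSize(Ty);     // includes tail padding
        Offset = alignTo(Offset, Align); // insert the alignment padding
        // Offset is now where this argument's kernarg loads begin.
        Offset += Size;
      }
      return Offset; // explicit kernarg segment size, before any rounding
    }

For @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) in the tests below, this walk places %arg1 at alignTo(17, 8) = 24, which is the 0x18 offset the s_load_dword checks expect.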
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
4 files changed, 206 insertions, 21 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index 5c2c868476b..9492b710d13 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -589,6 +589,17 @@ entry:
 ;   ret void
 ; }
 
+; FUNC-LABEL: {{^}}i65_arg:
+; HSA-VI: kernarg_segment_byte_size = 24
+; HSA-VI: kernarg_segment_alignment = 4
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind {
+entry:
+  store i65 %in, i65 addrspace(1)* %out, align 4
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}i1_arg:
 ; HSA-VI: kernarg_segment_byte_size = 12
 ; HSA-VI: kernarg_segment_alignment = 4
@@ -651,7 +662,7 @@ define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwi
 }
 
 ; FUNC-LABEL: {{^}}empty_struct_arg:
-; HSA: kernarg_segment_byte_size = 0
+; HSA-VI: kernarg_segment_byte_size = 0
 define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
   ret void
 }
@@ -667,11 +678,11 @@ define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
 
 ; FIXME: Total argument size is computed wrong
 ; FUNC-LABEL: {{^}}struct_argument_alignment:
-; HSA: kernarg_segment_byte_size = 40
-; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
-; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
-; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
+; HSA-VI: kernarg_segment_byte_size = 40
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
 define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
   %val0 = extractvalue {i32, i64} %arg0, 0
   %val1 = extractvalue {i32, i64} %arg0, 1
@@ -687,11 +698,11 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
 ; No padding between i8 and next struct, but round up at end to 4 byte
 ; multiple.
 ; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
-; HSA: kernarg_segment_byte_size = 28
-; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
-; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
-; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
+; HSA-VI: kernarg_segment_byte_size = 28
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
   %val0 = extractvalue <{i32, i64}> %arg0, 0
   %val1 = extractvalue <{i32, i64}> %arg0, 1
@@ -703,3 +714,47 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
   store volatile i64 %val3, i64 addrspace(1)* null
   ret void
 }
+
+; GCN-LABEL: {{^}}struct_argument_alignment_after:
+; HSA-VI: kernarg_segment_byte_size = 64
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
+; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
+define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
+  %val0 = extractvalue {i32, i64} %arg0, 0
+  %val1 = extractvalue {i32, i64} %arg0, 1
+  %val2 = extractvalue {i32, i64} %arg2, 0
+  %val3 = extractvalue {i32, i64} %arg2, 1
+  store volatile i32 %val0, i32 addrspace(1)* null
+  store volatile i64 %val1, i64 addrspace(1)* null
+  store volatile i32 %val2, i32 addrspace(1)* null
+  store volatile i64 %val3, i64 addrspace(1)* null
+  store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null
+  ret void
+}
+
+; GCN-LABEL: {{^}}array_3xi32:
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
+define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
+  store volatile i16 %arg0, i16 addrspace(1)* undef
+  store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef
+  ret void
+}
+
+; FIXME: Why not all scalar loads?
+; GCN-LABEL: {{^}}array_3xi16:
+; HSA-VI: s_add_u32 s{{[0-9]+}}, s4, 2
+; HSA-VI: s_addc_u32 s{{[0-9]+}}, s5, 0
+; HSA-VI: flat_load_ushort
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
+define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
+  store volatile i8 %arg0, i8 addrspace(1)* undef
+  store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
new file mode 100644
index 00000000000..a1bb6c28e74
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
@@ -0,0 +1,132 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-ir-lower-kernel-arguments=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
+
+; Repeat of some problematic tests in kernel-args.ll, with the IR
+; argument lowering pass disabled. Struct padding needs to be
+; accounted for, as well as legalization of types changing offsets.
+
+; FUNC-LABEL: {{^}}i1_arg:
+; HSA-VI: kernarg_segment_byte_size = 12
+; HSA-VI: kernarg_segment_alignment = 4
+
+; GCN: s_load_dword s
+; GCN: s_and_b32
+define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
+  store i1 %x, i1 addrspace(1)* %out, align 1
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v3i8_arg:
+; HSA-VI: kernarg_segment_byte_size = 12
+; HSA-VI: kernarg_segment_alignment = 4
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
+define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
+entry:
+  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}i65_arg:
+; HSA-VI: kernarg_segment_byte_size = 24
+; HSA-VI: kernarg_segment_alignment = 4
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind {
+entry:
+  store i65 %in, i65 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}empty_struct_arg:
+; HSA-VI: kernarg_segment_byte_size = 0
+define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
+  ret void
+}
+
+; The correct load offsets for these:
+; load 4 from 0,
+; load 8 from 8
+; load 4 from 24
+; load 8 from 32
+
+; With the SelectionDAG argument lowering, the alignments for the
+; struct members is not properly considered, making these wrong.
+
+; FIXME: Total argument size is computed wrong
+; FUNC-LABEL: {{^}}struct_argument_alignment:
+; HSA-VI: kernarg_segment_byte_size = 40
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
+define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
+  %val0 = extractvalue {i32, i64} %arg0, 0
+  %val1 = extractvalue {i32, i64} %arg0, 1
+  %val2 = extractvalue {i32, i64} %arg1, 0
+  %val3 = extractvalue {i32, i64} %arg1, 1
+  store volatile i32 %val0, i32 addrspace(1)* null
+  store volatile i64 %val1, i64 addrspace(1)* null
+  store volatile i32 %val2, i32 addrspace(1)* null
+  store volatile i64 %val3, i64 addrspace(1)* null
+  ret void
+}
+
+; No padding between i8 and next struct, but round up at end to 4 byte
+; multiple.
+; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
+; HSA-VI: kernarg_segment_byte_size = 28
+; HSA-VI: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
+; HSA-VI: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
+define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
+  %val0 = extractvalue <{i32, i64}> %arg0, 0
+  %val1 = extractvalue <{i32, i64}> %arg0, 1
+  %val2 = extractvalue <{i32, i64}> %arg1, 0
+  %val3 = extractvalue <{i32, i64}> %arg1, 1
+  store volatile i32 %val0, i32 addrspace(1)* null
+  store volatile i64 %val1, i64 addrspace(1)* null
+  store volatile i32 %val2, i32 addrspace(1)* null
+  store volatile i64 %val3, i64 addrspace(1)* null
+  ret void
+}
+
+; GCN-LABEL: {{^}}struct_argument_alignment_after:
+; HSA-VI: kernarg_segment_byte_size = 64
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
+; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
+define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
+  %val0 = extractvalue {i32, i64} %arg0, 0
+  %val1 = extractvalue {i32, i64} %arg0, 1
+  %val2 = extractvalue {i32, i64} %arg2, 0
+  %val3 = extractvalue {i32, i64} %arg2, 1
+  store volatile i32 %val0, i32 addrspace(1)* null
+  store volatile i64 %val1, i64 addrspace(1)* null
+  store volatile i32 %val2, i32 addrspace(1)* null
+  store volatile i64 %val3, i64 addrspace(1)* null
+  store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null
+  ret void
+}
+
+; GCN-LABEL: {{^}}array_3xi32:
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
+define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
+  store volatile i16 %arg0, i16 addrspace(1)* undef
+  store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}array_3xi16:
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
+define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
+  store volatile i8 %arg0, i8 addrspace(1)* undef
+  store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
index f860a122a88..6a9191e7dcb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
@@ -33,7 +33,7 @@ define amdgpu_kernel void @opencl_kernel_implicitarg_ptr_empty() #1 {
 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
 ; HSA: kernarg_segment_byte_size = 112
-; MESA: kernarg_segment_byte_size = 464
+; MESA: kernarg_segment_byte_size = 128
 
 ; HSA: s_load_dword s0, s[4:5], 0x1c
 define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 {
@@ -47,7 +47,7 @@ define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 {
 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
 ; HSA: kernarg_segment_byte_size = 160
-; MESA: kernarg_segment_byte_size = 464
+; MESA: kernarg_segment_byte_size = 128
 
 ; HSA: s_load_dword s0, s[4:5], 0x1c
 define amdgpu_kernel void @opencl_kernel_implicitarg_ptr([112 x i8]) #1 {
@@ -118,10 +118,10 @@ define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func_empty() #1 {
 ; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func:
 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
 ; HSA: kernarg_segment_byte_size = 112
-; MESA: kernarg_segment_byte_size = 464
+; MESA: kernarg_segment_byte_size = 128
 
 ; HSA: s_add_u32 s6, s4, 0x70
-; MESA: s_add_u32 s6, s4, 0x1c0
+; MESA: s_add_u32 s6, s4, 0x70
 ; GCN: s_addc_u32 s7, s5, 0{{$}}
 ; GCN: s_swappc_b64
@@ -133,10 +133,9 @@ define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 {
 ; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func:
 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
 ; HSA: kernarg_segment_byte_size = 160
-; MESA: kernarg_segment_byte_size = 464
+; MESA: kernarg_segment_byte_size = 128
 
-; HSA: s_add_u32 s6, s4, 0x70
-; MESA: s_add_u32 s6, s4, 0x1c0
+; GCN: s_add_u32 s6, s4, 0x70
 ; GCN: s_addc_u32 s7, s5, 0{{$}}
 ; GCN: s_swappc_b64
@@ -219,8 +218,7 @@ define void @opencl_func_kernarg_implicitarg_ptr() #0 {
 ; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func:
 ; GCN: s_mov_b64 s[6:7], s[4:5]
-; HSA: s_add_u32 s8, s6, 0x70
-; MESA: s_add_u32 s8, s6, 0x1c0
+; GCN: s_add_u32 s8, s6, 0x70
 ; GCN: s_addc_u32 s9, s7, 0
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
index 6c1bc9eaa76..5853d8d8e4e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -79,7 +79,7 @@ define amdgpu_kernel void @opencl_test_implicit_alignment(i32 addrspace(1)* %out
 ; CO-V2: enable_sgpr_kernarg_segment_ptr = 1
 ; HSA: kernarg_segment_byte_size = 0
 ; OS-MESA3D: kernarg_segment_byte_size = 16
-; CO-V2: kernarg_segment_alignment = 32
+; CO-V2: kernarg_segment_alignment = 4
 
 ; HSA: s_load_dword s{{[0-9]+}}, s[4:5]
 define amdgpu_kernel void @test_no_kernargs() #1 {
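
The new file's checks can be reproduced by invoking its RUN line by hand (a sketch; this assumes llc and FileCheck from an LLVM build are on PATH and the command runs from the source tree root):

    llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-ir-lower-kernel-arguments=0 \
        -verify-machineinstrs < llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll \
      | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC \
        llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll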

