AMDGPU: Try a lot harder to emit scalar loads

This has two main components. First, widen widen short constant loads in DAG when they have the correct alignment. This is already done a bit in AMDGPUCodeGenPrepare, since that has access to DivergenceAnalysis. This can't help kernarg loads created in the DAG. Start to use DAG divergence analysis to help this case. The second part is to avoid kernel argument lowering breaking the alignment of short vector elements because calling convention lowering wants to split everything into legal register types. When loading a split type, load the nearest 4-byte aligned segment and shift to get the desired bits. This extra load of the earlier argument piece ends up merging, and the bit extract hopefully folds out. There are a number of improvements and regressions with this, but I think as-is this is a better compromise between several of the worst parts of SelectionDAG. Particularly when i16 is legal, this produces worse code for i8 and i16 element vector kernel arguments. This is partially due to the very weak load merging the DAG does. It only looks for fairly specific combines between pairs of loads which no longer appear. In particular this causes v4i16 loads to be split into 2 components when previously the two halves were merged. Worse, because of the newly introduced shifts, there is a lot more unnecessary vector packing and unpacking code emitted. At least some of this is due to reporting false for isTypeDesirableForOp for i16 as a workaround for the lack of divergence information in the DAG. The cases where this happens it doesn't actually matter, but the relevant code in SimplifyDemandedBits doens't have the context to know to ignore this. The use of the scalar cache is probably more important than the mess of mostly scalar instructions doing this packing and unpacking. Future work can fix this, possibly by making better use of the new DAG divergence information for controlling promotion decisions, or adding another version of shift + trunc + shift combines that doesn't only know about the used types. llvm-svn: 334180
author: Matt Arsenault <Matthew.Arsenault@amd.com> 2018-06-07 09:54:49 +0000
committer: Matt Arsenault <Matthew.Arsenault@amd.com> 2018-06-07 09:54:49 +0000
commit: 90083d3088ae859289a4277540dbf28b576fd59f (patch)
tree: 3b7ffdc7539d10cb975beb02e073f63d181edb16 /llvm/test/CodeGen/AMDGPU/kernel-args.ll
parent: 45dde418a9c4e075934ef124a31f5491a1082421 (diff)
download: bcm5719-llvm-90083d3088ae859289a4277540dbf28b576fd59f.tar.gz
bcm5719-llvm-90083d3088ae859289a4277540dbf28b576fd59f.zip
1 files changed, 102 insertions, 108 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index cb97d716e38..2f7d41ccf03 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -11,12 +11,10 @@
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
 ; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
 ; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
-; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
-; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
-; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
-; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
-; FIXME: Should be using s_load_dword
-; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
+
+; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
+; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
+
 
 define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
 entry:
@@ -31,13 +29,9 @@ entry:
 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
-; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
-; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
-; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
-; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
-; FIXME: Should be using s_load_dword
-; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
 
+; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
+; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
 define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
 entry:
   %0 = zext i8 %in to i32
@@ -50,14 +44,12 @@ entry:
 ; HSA-VI: kernarg_segment_alignment = 4
 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
+
 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
-; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
-; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
-; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
-; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
-; FIXME: Should be using s_load_dword
-; HSA-VI: flat_load_sbyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
 
+; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
+; HSA-VI: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]]
+; HSA-VI: flat_store_dword
 define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
 entry:
   %0 = sext i8 %in to i32
@@ -71,15 +63,13 @@ entry:
 
 ; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
 ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
+
 ; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
 ; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
-; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
-; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
-; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
-; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
-; FIXME: Should be using s_load_dword
-; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
 
+; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
+; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
+; HSA-VI: flat_store_dword
 define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
 entry:
   %0 = zext i16 %in to i32
@@ -94,13 +84,10 @@ entry:
 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
-; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
-; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
-; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
-; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
-; FIXME: Should be using s_load_dword
-; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
 
+; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
+; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
+; HSA-VI: flat_store_dword
 define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
 entry:
   %0 = zext i16 %in to i32
@@ -115,13 +102,11 @@ entry:
 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
-; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
-; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
-; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
-; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
-; FIXME: Should be using s_load_dword
-; HSA-VI: flat_load_sshort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
 
+
+; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
+; HSA-VI: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]]
+; HSA-VI: flat_store_dword
 define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
 entry:
   %0 = sext i16 %in to i32
@@ -163,10 +148,8 @@ entry:
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-
-; HSA: flat_load_ushort
+; GCN: s_load_dword s
+; GCN-NOT: {{buffer|flat|global}}_load_
 define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
 entry:
   store <2 x i8> %in, <2 x i8> addrspace(1)* %out
@@ -226,15 +209,9 @@ entry:
 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-
-; MESA-VI: buffer_load_ushort
-; MESA-VI: buffer_load_ubyte
 
-; HSA-VI: flat_load_ushort
-; HSA-VI: flat_load_ubyte
+; GCN: s_load_dword s
+; GCN-NOT: {{buffer|flat|global}}_load_
 define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
 entry:
   store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
@@ -249,8 +226,8 @@ entry:
 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
 
-; GCN-DAG: s_load_dword s
-; GCN-DAG: {{buffer|flat}}_load_ushort
+; GCN: s_load_dword s
+; GCN: s_load_dword s
 define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
 entry:
   store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
@@ -294,12 +271,8 @@ entry:
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-
-; VI: s_load_dword s
+; GCN: s_load_dword s
+; GCN-NOT: {{buffer|flat|global}}_load_
 define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
 entry:
   store <4 x i8> %in, <4 x i8> addrspace(1)* %out
@@ -314,7 +287,8 @@ entry:
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 
-; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb
+; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc
 ; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9
 
 ; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x2c
@@ -361,6 +335,7 @@ entry:
   ret void
 }
 
+; FIXME: Lots of unpack and re-pack junk on VI
 ; FUNC-LABEL: {{^}}v8i8_arg:
 ; HSA-VI: kernarg_segment_byte_size = 16
 ; HSA-VI: kernarg_segment_alignment = 4
@@ -373,16 +348,23 @@ entry:
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
 
-; VI: s_load_dwordx2
-; VI: s_load_dwordx2
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dwordx2 s
+; SI-NOT: {{buffer|flat|global}}_load
+
+; VI: s_load_dword s
+; VI: s_load_dword s
+
+; VI: v_lshlrev_b16
+; VI: v_or_b32_e32
+; VI: v_or_b32_sdwa
+; VI: v_or_b32_sdwa
+; VI: v_lshlrev_b16
+; VI: s_lshr_b32
+; VI: v_or_b32_sdwa
+; VI: v_or_b32_sdwa
 define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
 entry:
   store <8 x i8> %in, <8 x i8> addrspace(1)* %out
@@ -401,9 +383,13 @@ entry:
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
 ; SI: s_load_dwordx2
-; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI-NOT: {{buffer|flat|global}}_load
+
 
 ; VI: s_load_dwordx2
 ; VI: s_load_dword s
@@ -454,6 +440,8 @@ entry:
   ret void
 }
 
+; FIXME: Pack/repack on VI
+
 ; FUNC-LABEL: {{^}}v16i8_arg:
 ; HSA-VI: kernarg_segment_byte_size = 32
 ; HSA-VI: kernarg_segment_alignment = 4
@@ -474,26 +462,33 @@ entry:
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dwordx2
+; SI-NOT: {{buffer|flat|global}}_load
 
-; VI: s_load_dwordx2
-; VI: s_load_dwordx2
-; VI: s_load_dwordx2
+
+; VI: s_load_dword s
+; VI: s_load_dword s
+; VI: s_load_dword s
+; VI: s_load_dword s
+
+; VI: s_lshr_b32
+; VI: v_lshlrev_b16
+; VI: s_lshr_b32
+; VI: s_lshr_b32
+; VI: v_or_b32_sdwa
+; VI: v_or_b32_sdwa
+; VI: v_lshlrev_b16
+; VI: v_lshlrev_b16
+; VI: v_or_b32_sdwa
+; VI: v_or_b32_sdwa
+; VI: v_lshlrev_b16
+; VI: v_lshlrev_b16
+; VI: v_or_b32_sdwa
+; VI: v_or_b32_sdwa
 define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
 entry:
   store <16 x i8> %in, <16 x i8> addrspace(1)* %out
@@ -508,6 +503,7 @@ entry:
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
+
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
@@ -524,9 +520,13 @@ entry:
 ; SI: s_load_dword s
 ; SI: s_load_dword s
 ; SI: s_load_dword s
-; SI: s_load_dwordx2
-; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+
+; SI-NOT: {{buffer|flat|global}}_load
+
 
 ; VI: s_load_dword s
 ; VI: s_load_dword s
@@ -634,10 +634,9 @@ entry:
 ; HSA-VI: kernarg_segment_byte_size = 12
 ; HSA-VI: kernarg_segment_alignment = 4
 
-; SI: buffer_load_ubyte
-; SI: v_and_b32_e32
-; SI: buffer_store_byte
-; SI: s_endpgm
+; GCN: s_load_dword s
+; GCN: s_and_b32
+; GCN: {{buffer|flat}}_store_byte
 define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
   store i1 %x, i1 addrspace(1)* %out, align 1
   ret void
@@ -647,9 +646,8 @@ define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
 ; HSA-VI: kernarg_segment_byte_size = 12
 ; HSA-VI: kernarg_segment_alignment = 4
 
-; SI: buffer_load_ubyte
-; SI: buffer_store_dword
-; SI: s_endpgm
+; GCN: s_load_dword
+; SGCN: buffer_store_dword
 define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
   %ext = zext i1 %x to i32
   store i32 %ext, i32 addrspace(1)* %out, align 4
@@ -660,9 +658,8 @@ define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwi
 ; HSA-VI: kernarg_segment_byte_size = 12
 ; HSA-VI: kernarg_segment_alignment = 4
 
-; SI: buffer_load_ubyte
-; SI: buffer_store_dwordx2
-; SI: s_endpgm
+; GCN: s_load_dword s
+; GCN: {{buffer|flat}}_store_dwordx2
 define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
   %ext = zext i1 %x to i64
   store i64 %ext, i64 addrspace(1)* %out, align 8
@@ -673,9 +670,8 @@ define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwi
 ; HSA-VI: kernarg_segment_byte_size = 12
 ; HSA-VI: kernarg_segment_alignment = 4
 
-; SI: buffer_load_ubyte
-; SI: buffer_store_dword
-; SI: s_endpgm
+; GCN: s_load_dword
+; GCN: {{buffer|flat}}_store_dword
 define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
   %ext = sext i1 %x to i32
   store i32 %ext, i32addrspace(1)* %out, align 4
@@ -686,11 +682,9 @@ define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwi
 ; HSA-VI: kernarg_segment_byte_size = 12
 ; HSA-VI: kernarg_segment_alignment = 4
 
-; SI: buffer_load_ubyte
-; SI: v_bfe_i32
-; SI: v_ashrrev_i32
-; SI: buffer_store_dwordx2
-; SI: s_endpgm
+; GCN: s_load_dword
+; GCN: s_bfe_i64
+; GCN: {{buffer|flat}}_store_dwordx2
 define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
   %ext = sext i1 %x to i64
   store i64 %ext, i64 addrspace(1)* %out, align 8
author	Matt Arsenault <Matthew.Arsenault@amd.com>	2018-06-07 09:54:49 +0000
committer	Matt Arsenault <Matthew.Arsenault@amd.com>	2018-06-07 09:54:49 +0000
commit	90083d3088ae859289a4277540dbf28b576fd59f (patch)
tree	3b7ffdc7539d10cb975beb02e073f63d181edb16 /llvm/test/CodeGen/AMDGPU/kernel-args.ll
parent	45dde418a9c4e075934ef124a31f5491a1082421 (diff)
download	bcm5719-llvm-90083d3088ae859289a4277540dbf28b576fd59f.tar.gz bcm5719-llvm-90083d3088ae859289a4277540dbf28b576fd59f.zip