author     Matt Arsenault <Matthew.Arsenault@amd.com>  2018-05-30 16:17:51 +0000
committer  Matt Arsenault <Matthew.Arsenault@amd.com>  2018-05-30 16:17:51 +0000
commit     7b4826e6cec09e308dda43f6c4d1dc25d0905b7a (patch)
tree       9067dff092c8b2892e529d71f88f2454d064f836 /llvm/test
parent     ebaaa2ddae28edd5a29eeec3a7eb927da11f4fb6 (diff)
download   bcm5719-llvm-7b4826e6cec09e308dda43f6c4d1dc25d0905b7a.tar.gz
           bcm5719-llvm-7b4826e6cec09e308dda43f6c4d1dc25d0905b7a.zip
AMDGPU: Use better alignment for kernarg lowering
This was just emitting loads with the ABI alignment for the raw type. The true alignment is often better, especially when an illegal vector type is scalarized. The better alignment allows using a scalar load more often.

llvm-svn: 333558
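The underlying idea is easy to sketch: the kernarg segment base is known to be highly aligned (at least 16 bytes on AMDGPU), so the alignment that can be assumed for an individual argument load is the largest power of two dividing both the base alignment and the argument's byte offset, rather than the ABI alignment of the argument's raw type. The following is a minimal C++ sketch of that computation, not the exact code this commit changes; the helper mirrors llvm::MinAlign from llvm/Support/MathExtras.h, and the 16-byte base alignment is an assumption stated in the comments:

    #include <cstdint>

    // Largest power of two dividing both A and B; the same bit trick as
    // llvm::MinAlign: x & -x isolates the lowest set bit.
    static uint64_t minAlign(uint64_t A, uint64_t B) {
      return (A | B) & (1 + ~(A | B));
    }

    // Assumed here: the AMDGPU kernarg segment is at least 16-byte aligned.
    static constexpr uint64_t KernargBaseAlign = 16;

    // Alignment we can legally assume for a load of the argument placed
    // `Offset` bytes into the kernarg segment. This is often larger than
    // the ABI alignment of the raw type (e.g. 2 for a scalarized half
    // element), which is what permits a wide scalar access.
    uint64_t kernargLoadAlign(uint64_t Offset) {
      return minAlign(KernargBaseAlign, Offset);
    }

With only the ABI alignment of half (2 bytes), a <2 x half> argument could be loaded only as two 2-byte pieces; with a deduced alignment of 4 or more, the same argument is loaded with a single s_load_dword, which is what the updated checks below expect.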
Diffstat (limited to 'llvm/test')
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll                |  16
-rw-r--r--  llvm/test/CodeGen/AMDGPU/half.ll                         |  34
-rw-r--r--  llvm/test/CodeGen/AMDGPU/kernel-args.ll                  | 180
-rw-r--r--  llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll |   6
4 files changed, 94 insertions, 142 deletions
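The byte offsets in the updated checks follow directly from that computation. Below is a hypothetical worked example for the v2i16_arg test in kernel-args.ll; the offsets 0x2c and 0x8 are taken from the checks themselves, while the 36-byte MESA implicit-argument area is an assumption inferred from those offsets:

    #include <cstdint>
    #include <cstdio>

    // Same helper as in the sketch above (mirrors llvm::MinAlign).
    static uint64_t minAlign(uint64_t A, uint64_t B) {
      return (A | B) & (1 + ~(A | B));
    }

    int main() {
      const uint64_t Base = 16;       // assumed kernarg segment alignment
      // <2 x i16> %in follows an 8-byte out pointer argument.
      const uint64_t HsaOffset = 8;   // HSA: user args start at byte 0 -> 0x8
      const uint64_t MesaOffset = 44; // MESA: 36 implicit bytes + 8 -> 0x2c

      // Both results are >= 4, so a single 4-byte s_load_dword is legal in
      // each case, replacing the pair of buffer_load_ushort instructions
      // the old checks expected.
      printf("HSA  deduced align: %llu\n",
             (unsigned long long)minAlign(Base, HsaOffset));  // prints 8
      printf("MESA deduced align: %llu\n",
             (unsigned long long)minAlign(Base, MesaOffset)); // prints 4
      return 0;
    }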
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index 1ece288cd5e..d7141efc82d 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -88,11 +88,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(<2 x half> addrspace(1)*
; Combine turns this into integer op when bitcast source (from load)
; GCN-LABEL: {{^}}s_fneg_fabs_v2f16_bc_src:
-; CI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, v{{[0-9]+}}
-; CI: v_or_b32_e32 [[OR:v[0-9]+]], v{{[0-9]+}}, [[SHL]]
-; CI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, [[OR]]
; FIXME: Random commute
+; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}}
define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(<2 x half> addrspace(1)* %out, <2 x half> %in) {
@@ -103,16 +101,12 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(<2 x half> addrspace(1)* %ou
}
; GCN-LABEL: {{^}}fneg_fabs_v4f16:
-; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
-; CI: v_lshlrev_b32_e32 [[SHL0:v[0-9]+]], 16, v{{[0-9]+}}
-; CI: v_or_b32_e32 [[OR0:v[0-9]+]], v{{[0-9]+}}, [[SHL0]]
-; CI: v_lshlrev_b32_e32 [[SHL1:v[0-9]+]], 16, v{{[0-9]+}}
-; CI: v_or_b32_e32 [[OR1:v[0-9]+]], v{{[0-9]+}}, [[SHL1]]
-; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR0]]
-; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR1]]
; FIXME: Random commute
-; GFX89: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
+; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
+
+; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
+; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index a042700edf8..f31b2ab5563 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -13,17 +13,10 @@ define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
ret void
}
-; FIXME: Should always be the same
; GCN-LABEL: {{^}}load_v2f16_arg:
-; SI-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
-; SI-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
-; SI: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
-; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]]
-; SI: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-
-; VI: s_load_dword [[ARG:s[0-9]+]]
-; VI: v_mov_b32_e32 [[V_ARG:v[0-9]+]], [[ARG]]
-; VI: buffer_store_dword [[V_ARG]]
+; GCN: s_load_dword [[ARG:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[V_ARG:v[0-9]+]], [[ARG]]
+; GCN: buffer_store_dword [[V_ARG]]
define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
store <2 x half> %arg, <2 x half> addrspace(1)* %out
ret void
@@ -31,8 +24,8 @@ define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x ha
; GCN-LABEL: {{^}}load_v3f16_arg:
; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
+; GCN: s_load_dword s
+
; GCN-NOT: buffer_load
; GCN-DAG: buffer_store_dword
; GCN-DAG: buffer_store_short
@@ -43,19 +36,14 @@ define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x ha
ret void
}
-; GCN-LABEL: {{^}}load_v4f16_arg:
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_store_dwordx2
; FIXME: Why not one load?
-; VI-DAG: s_load_dword [[ARG0_LO:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI-DAG: s_load_dword [[ARG0_HI:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
-; VI-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], [[ARG0_LO]]
-; VI-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], [[ARG0_HI]]
-; VI: buffer_store_dwordx2 v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}}
+; GCN-LABEL: {{^}}load_v4f16_arg:
+; GCN-DAG: s_load_dword [[ARG0_LO:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; GCN-DAG: s_load_dword [[ARG0_HI:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], [[ARG0_LO]]
+; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], [[ARG0_HI]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}}
define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
store <4 x half> %arg, <4 x half> addrspace(1)* %out
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index f51366f2665..cb97d716e38 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -162,10 +162,11 @@ entry:
; EG: VTX_READ_8
; EG: VTX_READ_8
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
+
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; HSA: flat_load_ushort
define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
entry:
store <2 x i8> %in, <2 x i8> addrspace(1)* %out
@@ -179,10 +180,9 @@ entry:
; EG: VTX_READ_16
; EG: VTX_READ_16
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-
-; VI: s_load_dword s
+; SI: s_load_dword s{{[0-9]+}}, s[0:1], 0xb
+; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
entry:
store <2 x i16> %in, <2 x i16> addrspace(1)* %out
@@ -226,11 +226,14 @@ entry:
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; MESA-VI: buffer_load_ushort
+; MESA-VI: buffer_load_ubyte
+
+; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ubyte
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
entry:
@@ -245,12 +248,9 @@ entry:
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
-; MESA-GCN: buffer_load_ushort
-; MESA-GCN: buffer_load_ushort
-; MESA-GCN: buffer_load_ushort
-; HSA-VI: flat_load_ushort
-; HSA-VI: flat_load_ushort
-; HSA-VI: flat_load_ushort
+
+; GCN-DAG: s_load_dword s
+; GCN-DAG: {{buffer|flat}}_load_ushort
define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
entry:
store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
@@ -293,14 +293,13 @@ entry:
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
+
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; VI: s_load_dword s
define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
entry:
store <4 x i8> %in, <4 x i8> addrspace(1)* %out
@@ -315,13 +314,14 @@ entry:
; EG: VTX_READ_16
; EG: VTX_READ_16
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
+; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb
+; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9
-; VI: s_load_dword s
-; VI: s_load_dword s
+; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x30
+
+; HSA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x8
+; HSA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
entry:
store <4 x i16> %in, <4 x i16> addrspace(1)* %out
@@ -372,21 +372,17 @@ entry:
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
+
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; VI: s_load_dwordx2
+; VI: s_load_dwordx2
define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
entry:
store <8 x i8> %in, <8 x i8> addrspace(1)* %out
@@ -405,15 +401,11 @@ entry:
; EG: VTX_READ_16
; EG: VTX_READ_16
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; VI: s_load_dwordx2
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
@@ -481,38 +473,27 @@ entry:
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
+
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; VI: s_load_dwordx2
+; VI: s_load_dwordx2
+; VI: s_load_dwordx2
define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
entry:
store <16 x i8> %in, <16 x i8> addrspace(1)* %out
@@ -539,22 +520,13 @@ entry:
; EG: VTX_READ_16
; EG: VTX_READ_16
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
; VI: s_load_dword s
; VI: s_load_dword s
diff --git a/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll b/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
index 601aca48e1e..f2f845b86db 100644
--- a/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
@@ -39,10 +39,8 @@ define amdgpu_kernel void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)*
}
; GCN-LABEL: {{^}}store_v4i16_as_v2i32_align_4:
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
+; GCN: s_load_dword s
+; GCN: s_load_dwordx2 s
; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
define amdgpu_kernel void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 {
%x.bc = bitcast <4 x i16> %x to <2 x i32>