author     Matt Arsenault <Matthew.Arsenault@amd.com>  2018-06-07 09:54:49 +0000
committer  Matt Arsenault <Matthew.Arsenault@amd.com>  2018-06-07 09:54:49 +0000
commit     90083d3088ae859289a4277540dbf28b576fd59f (patch)
tree       3b7ffdc7539d10cb975beb02e073f63d181edb16 /llvm/test/CodeGen/AMDGPU
parent     45dde418a9c4e075934ef124a31f5491a1082421 (diff)
AMDGPU: Try a lot harder to emit scalar loads
This has two main components. First, widen short constant
loads in the DAG when they have the correct alignment. This
is already done a bit in AMDGPUCodeGenPrepare, since that
pass has access to DivergenceAnalysis, but that can't help
kernarg loads created in the DAG. Start to use the DAG
divergence analysis to help this case.
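As a rough sketch of the goal (the function and the before/after
lines below are illustrative, not taken from the tests touched by
this patch):

  ; An i16 kernel argument whose containing dword is 4-byte aligned
  ; can be fetched through the scalar unit, using only the low 16 bits.
  define amdgpu_kernel void @widen_i16_arg(i16 addrspace(1)* %out, i16 %x) {
    store i16 %x, i16 addrspace(1)* %out
    ret void
  }
  ; Before: {{flat|buffer}}_load_ushort v0, ...  ; sub-dword load on the vector path
  ; After:  s_load_dword s0, ...                 ; scalar load of the containing dword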
The second part is to avoid kernel argument lowering
breaking the alignment of short vector elements because
calling convention lowering wants to split everything
into legal register types.
When loading a split type, load the nearest 4-byte aligned
segment and shift to get the desired bits. This extra
load of the earlier argument piece ends up merging,
and the bit extract hopefully folds out.
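A minimal sketch of that lowering, with made-up registers and
offsets (the real offsets depend on the target's kernarg layout):

  s_load_dword s0, s[0:1], 0x2c   ; nearest 4-byte aligned dword covering the split element
  s_lshr_b32   s1, s0, 8          ; shift the wanted bits down to the low end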
There are a number of improvements and regressions with
this, but I think as-is this is a better compromise between
several of the worst parts of SelectionDAG.
Particularly when i16 is legal, this produces worse code
for i8 and i16 element vector kernel arguments. This is
partially due to the very weak load merging the DAG does:
it only looks for fairly specific combines between pairs of
loads, and those patterns no longer appear. In particular,
this causes v4i16 loads to be split into two components
where previously the two halves were merged.
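For instance, with a hypothetical kernel taking a v4i16
argument:

  define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
    store <4 x i16> %in, <4 x i16> addrspace(1)* %out
    ret void
  }

the argument now tends to come in as two separate s_load_dword
(one per half) rather than the single merged load it got before.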
Worse, because of the newly introduced shifts, there
is a lot more unnecessary vector packing and unpacking code
emitted. At least some of this is due to reporting
false for isTypeDesirableForOp for i16 as a workaround for
the lack of divergence information in the DAG. In the cases
where this happens it doesn't actually matter, but the
relevant code in SimplifyDemandedBits doesn't have the context
to know to ignore this.
The use of the scalar cache is probably more important
than the mess of mostly scalar instructions doing this packing
and unpacking. Future work can fix this, possibly by making better
use of the new DAG divergence information for controlling promotion
decisions, or adding another version of shift + trunc + shift
combines that doesn't only know about the used types.
llvm-svn: 334180
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
26 files changed, 714 insertions, 462 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll index c320e0609c2..b5b3e78a142 100644 --- a/llvm/test/CodeGen/AMDGPU/and.ll +++ b/llvm/test/CodeGen/AMDGPU/and.ll @@ -176,9 +176,13 @@ define amdgpu_kernel void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { ret void } -; FIXME: Should use SGPRs ; FUNC-LABEL: {{^}}s_and_i1: -; SI: v_and_b32 +; SI: s_load_dword [[LOAD:s[0-9]+]] +; SI: s_lshr_b32 [[B_SHIFT:s[0-9]+]], [[LOAD]], 8 +; SI: s_and_b32 [[AND:s[0-9]+]], [[LOAD]], [[B_SHIFT]] +; SI: s_and_b32 [[AND_TRUNC:s[0-9]+]], [[AND]], 1{{$}} +; SI: v_mov_b32_e32 [[V_AND_TRUNC:v[0-9]+]], [[AND_TRUNC]] +; SI: buffer_store_byte [[V_AND_TRUNC]] define amdgpu_kernel void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) { %and = and i1 %a, %b store i1 %and, i1 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll index 6d3878c8ab2..14327e30aff 100644 --- a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll @@ -2,14 +2,16 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s +; FIXME: Should be same on CI/VI ; GCN-LABEL: {{^}}s_ashr_v2i16: ; GFX9: s_load_dword [[LHS:s[0-9]+]] ; GFX9: s_load_dword [[RHS:s[0-9]+]] ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]] ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]] -; VI: s_load_dword [[LHS:s[0-9]+]] -; VI: s_load_dword [[RHS:s[0-9]+]] +; CIVI: s_load_dword [[LHS:s[0-9]+]] +; CIVI: s_load_dword [[RHS:s[0-9]+]] + ; VI: s_ashr_i32 ; VI: s_ashr_i32 ; VI: s_sext_i32_i16 @@ -20,11 +22,14 @@ ; VI: s_and_b32 ; VI: s_or_b32 -; CI-DAG: v_ashrrev_i32_e32 -; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} -; CI-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; CI: v_or_b32_e32 +; CI: s_ashr_i32 +; CI: s_and_b32 +; CI: s_lshr_b32 +; CI: s_sext_i32_i16 +; CI: s_ashr_i32 +; CI: s_ashr_i32 +; CI: s_lshl_b32 +; CI: s_and_b32 define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = ashr <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/basic-branch.ll b/llvm/test/CodeGen/AMDGPU/basic-branch.ll index d8f9e4f51ff..0607eca9cd1 100644 --- a/llvm/test/CodeGen/AMDGPU/basic-branch.ll +++ b/llvm/test/CodeGen/AMDGPU/basic-branch.ll @@ -1,7 +1,7 @@ -; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s -; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s +; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s +; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNNOOPT -check-prefix=GCN %s +; RUN: llc 
-march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNOPT -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCNOPT -check-prefix=GCN %s ; GCN-LABEL: {{^}}test_branch: ; GCNNOOPT: v_writelane_b32 @@ -28,10 +28,11 @@ end: } ; GCN-LABEL: {{^}}test_brcc_i1: -; GCN: buffer_load_ubyte -; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, -; GCN: v_cmp_eq_u32_e32 vcc, -; GCN: s_cbranch_vccnz [[END:BB[0-9]+_[0-9]+]] +; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCNNOOPT: s_and_b32 s{{[0-9]+}}, 1, [[VAL]] +; GCNOPT: s_and_b32 s{{[0-9]+}}, [[VAL]], 1 +; GCN: s_cmp_eq_u32 +; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]] ; GCN: buffer_store_dword diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll index 4872fbfadab..03e7e74c7c2 100644 --- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -105,8 +105,9 @@ for.body: ; GCN: s_cbranch_vccnz [[LOOPBB]] ; GCN-NEXT: ; %bb.2 ; GCN-NEXT: s_endpgm -define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind { +define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n) nounwind { entry: + %cond = load volatile i1, i1 addrspace(3)* null br label %for.body for.exit: diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index 50476e8e754..067f93a54c8 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -58,9 +58,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace( } ; GCN-LABEL: {{^}}extract_vector_elt_v3f16: -; GCN: buffer_load_ushort -; GCN: buffer_store_short -; GCN: buffer_store_short +; GCN: s_load_dword s +; GCN: s_load_dword s define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo) #0 { %p0 = extractelement <3 x half> %foo, i32 0 %p1 = extractelement <3 x half> %foo, i32 2 @@ -70,10 +69,11 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 ret void } +; FIXME: Why sometimes vector shift? 
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3f16: -; SICIVI: buffer_load_ushort -; SICIVI: buffer_load_ushort -; SICIVI: buffer_load_ushort +; GCN: s_load_dword s +; GCN: s_load_dword s +; GCN: s_load_dword s ; GFX9-DAG: global_load_short_d16_hi v ; GFX9-DAG: global_load_short_d16 v @@ -81,7 +81,7 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 ; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4 ; GFX89: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, v -; SI: v_lshr_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; SI: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; GCN: {{buffer|global}}_store_short define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll index 9c8f04fdfd9..c5f020fee7e 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICIVI %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SICIVI,GFX89 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,GFX89 %s ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s ; GCN-LABEL: {{^}}extract_vector_elt_v2i16: @@ -58,7 +58,8 @@ define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1 } ; GCN-LABEL: {{^}}extract_vector_elt_v3i16: -; GCN: buffer_load_ushort +; GCN: s_load_dword s +; GCN: s_load_dword s ; GCN: buffer_store_short ; GCN: buffer_store_short define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 { @@ -71,10 +72,10 @@ define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x } ; GCN-LABEL: {{^}}extract_vector_elt_v4i16: -; SICI: buffer_load_ushort -; SICI: buffer_load_ushort -; SICI: buffer_store_short -; SICI: buffer_store_short +; SI: s_load_dword s +; SI: s_load_dword s +; SI: buffer_store_short +; SI: buffer_store_short ; VI: s_load_dword s ; VI: s_load_dword s @@ -97,24 +98,19 @@ define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16: -; SICI: buffer_load_ushort -; SICI: buffer_load_ushort -; SICI: buffer_load_ushort +; GCN: s_load_dword s +; GCN: s_load_dword s +; GCN: s_load_dword s +; GCN-NOT: {{buffer|flat|global}} -; SICI: buffer_store_short -; SICI: buffer_store_short -; SICI: buffer_store_short - -; SICI: buffer_load_ushort -; SICI: buffer_store_short - -; GFX9-DAG: global_load_short_d16_hi v -; GFX9-DAG: global_load_short_d16 v +; FIXME: Unnecessary repacking +; GFX9: s_pack_ll_b32_b16 +; GFX9: s_pack_lh_b32_b16 ; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4 -; GFX89: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, v -; SI: v_lshr_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 
s{{[0-9]+}} + +; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s ; GCN: {{buffer|global}}_store_short define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll index 1d6349f887a..ce310b8c0a4 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll @@ -2,8 +2,9 @@ ; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; GCN-LABEL: {{^}}extract_vector_elt_v1i8: -; GCN: buffer_load_ubyte -; GCN: buffer_store_byte +; GCN: s_load_dword [[LOAD:s[0-9]+]] +; GCN: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]] +; GCN: buffer_store_byte [[V_LOAD]] define amdgpu_kernel void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 { %p0 = extractelement <1 x i8> %foo, i32 0 store i8 %p0, i8 addrspace(1)* %out @@ -11,8 +12,10 @@ define amdgpu_kernel void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i } ; GCN-LABEL: {{^}}extract_vector_elt_v2i8: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte +; GCN: s_load_dword s +; GCN-NOT: {{flat|buffer|global}} +; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8 +; GCN-NOT: {{flat|buffer|global}} ; GCN: buffer_store_byte ; GCN: buffer_store_byte define amdgpu_kernel void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 { @@ -25,8 +28,10 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i } ; GCN-LABEL: {{^}}extract_vector_elt_v3i8: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte +; GCN: s_load_dword s +; GCN-NOT: {{flat|buffer|global}} +; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; GCN-NOT: {{flat|buffer|global}} ; GCN: buffer_store_byte ; GCN: buffer_store_byte define amdgpu_kernel void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 { @@ -39,8 +44,10 @@ define amdgpu_kernel void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i } ; GCN-LABEL: {{^}}extract_vector_elt_v4i8: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte +; GCN: s_load_dword s +; GCN-NOT: {{flat|buffer|global}} +; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; GCN-NOT: {{flat|buffer|global}} ; GCN: buffer_store_byte ; GCN: buffer_store_byte define amdgpu_kernel void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 { @@ -53,8 +60,10 @@ define amdgpu_kernel void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i } ; GCN-LABEL: {{^}}extract_vector_elt_v8i8: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte +; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCN-NOT: {{flat|buffer|global}} +; GCN: s_lshr_b32 s{{[0-9]+}}, [[VAL]], 16 +; GCN-NOT: {{flat|buffer|global}} ; GCN: buffer_store_byte ; GCN: buffer_store_byte define amdgpu_kernel void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 { @@ -67,10 +76,13 @@ define amdgpu_kernel void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i } ; GCN-LABEL: {{^}}extract_vector_elt_v16i8: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_store_byte -; GCN: buffer_store_byte +; GCN: s_load_dword [[LOAD0:s[0-9]+]] +; GCN-NOT: {{flat|buffer|global}} +; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[LOAD0]], 16 +; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[LOAD0]] +; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]] +; GCN: buffer_store_byte [[V_ELT2]] +; GCN: 
buffer_store_byte [[V_LOAD0]] define amdgpu_kernel void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 { %p0 = extractelement <16 x i8> %foo, i32 0 %p1 = extractelement <16 x i8> %foo, i32 2 @@ -81,10 +93,13 @@ define amdgpu_kernel void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x } ; GCN-LABEL: {{^}}extract_vector_elt_v32i8: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_store_byte -; GCN: buffer_store_byte +; GCN: s_load_dword [[LOAD0:s[0-9]+]] +; GCN-NOT: {{flat|buffer|global}} +; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[LOAD0]], 16 +; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[LOAD0]] +; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]] +; GCN: buffer_store_byte [[V_ELT2]] +; GCN: buffer_store_byte [[V_LOAD0]] define amdgpu_kernel void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 { %p0 = extractelement <32 x i8> %foo, i32 0 %p1 = extractelement <32 x i8> %foo, i32 2 @@ -95,10 +110,13 @@ define amdgpu_kernel void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x } ; GCN-LABEL: {{^}}extract_vector_elt_v64i8: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_store_byte -; GCN: buffer_store_byte +; GCN: s_load_dword [[LOAD0:s[0-9]+]] +; GCN-NOT: {{flat|buffer|global}} +; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[LOAD0]], 16 +; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[LOAD0]] +; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]] +; GCN: buffer_store_byte [[V_ELT2]] +; GCN: buffer_store_byte [[V_LOAD0]] define amdgpu_kernel void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 { %p0 = extractelement <64 x i8> %foo, i32 0 %p1 = extractelement <64 x i8> %foo, i32 2 @@ -110,12 +128,19 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x ; FIXME: SI generates much worse code from that's a pain to match -; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v2i8: -; VI-DAG: buffer_load_ushort [[LOAD:v[0-9]+]], -; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30 +; FIXME: 16-bit and 32-bit shift not combined after legalize to to +; isTypeDesirableForOp in SimplifyDemandedBits -; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 -; VI: v_lshrrev_b16_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[LOAD]] +; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v2i8: +; VI: s_load_dword [[LOAD:s[0-9]+]], s[0:1], 0x2c +; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30 +; VI-NOT: {{flat|buffer|global}} +; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[LOAD]], 8 +; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[ELT1]] +; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}} +; VI: v_or_b32_e32 [[BUILD_VEC:v[0-9]+]], [[ELT0]], [[ELT2]] +; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 +; VI: v_lshrrev_b16_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[BUILD_VEC]] ; VI: buffer_store_byte [[EXTRACT]] define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo, i32 %idx) #0 { %elt = extractelement <2 x i8> %foo, i32 %idx @@ -124,14 +149,14 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(i8 addrspace(1)* %out } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i8: -; VI-DAG: buffer_load_ubyte [[LOAD2:v[0-9]+]], -; VI-DAG: buffer_load_ushort [[LOAD01:v[0-9]+]], -; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30 - +; VI: s_load_dword [[LOAD:s[0-9]+]], s[0:1], 0x2c +; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30 +; VI-NOT: {{flat|buffer|global}} +; VI: s_lshr_b32 [[ELT12:s[0-9]+]], [[LOAD]], 8 +; VI: v_lshlrev_b16_e64 [[ELT1:v[0-9]+]], 8, 
[[ELT12]] +; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}} +; VI: v_or_b32_e32 [[VEC3:v[0-9]+]], [[ELT0]], [[ELT1]] ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 - -; VI: v_lshlrev_b32_e32 [[ELT2:v[0-9]+]], 16, [[LOAD2]] -; VI: v_or_b32_e32 [[VEC3:v[0-9]+]], [[LOAD01]], [[ELT2]] ; VI: v_lshrrev_b32_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[VEC3]] ; VI: buffer_store_byte [[EXTRACT]] define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 { @@ -142,30 +167,33 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i8: -; VI-DAG: s_load_dword [[VEC3:s[0-9]+]], s[0:1], 0x2c -; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30 +; VI: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34 +; VI: s_load_dword [[VEC4:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 -; VI: s_lshr_b32 [[EXTRACT:s[0-9]+]], [[VEC3]], [[SCALED_IDX]] +; VI: s_lshr_b32 [[EXTRACT:s[0-9]+]], [[VEC4]], [[SCALED_IDX]] + ; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], [[EXTRACT]] ; VI: buffer_store_byte [[V_EXTRACT]] -define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo, i32 %idx) #0 { - %p0 = extractelement <4 x i8> %foo, i32 %idx +define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(4)* %vec.ptr, i32 %idx) #0 { + %vec = load <4 x i8>, <4 x i8> addrspace(4)* %vec.ptr + %p0 = extractelement <4 x i8> %vec, i32 %idx %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i8 %p0, i8 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v8i8: -; VI-DAG: s_load_dwordx2 [[VEC3:s\[[0-9]+:[0-9]+\]]], s[0:1], 0x2c -; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34 +; VI: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34 +; VI: s_load_dwordx2 [[VEC8:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 -; VI: s_lshr_b64 s{{\[}}[[EXTRACT_LO:[0-9]+]]:{{[0-9]+\]}}, [[VEC3]], [[SCALED_IDX]] +; VI: s_lshr_b64 s{{\[}}[[EXTRACT_LO:[0-9]+]]:{{[0-9]+\]}}, [[VEC8]], [[SCALED_IDX]] ; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], s[[EXTRACT_LO]] ; VI: buffer_store_byte [[V_EXTRACT]] -define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo, i32 %idx) #0 { - %p0 = extractelement <8 x i8> %foo, i32 %idx +define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> addrspace(4)* %vec.ptr, i32 %idx) #0 { + %vec = load <8 x i8>, <8 x i8> addrspace(4)* %vec.ptr + %p0 = extractelement <8 x i8> %vec, i32 %idx %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i8 %p0, i8 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index 3f1ffc7b70b..e6223983222 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -1,16 +1,21 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs 
< %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s ; DAGCombiner will transform: ; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFFFFFF)) ; unless isFabsFree returns true ; GCN-LABEL: {{^}}s_fabs_free_f16: -; GCN: {{flat|global}}_load_ushort [[VAL:v[0-9]+]], -; GCN: v_and_b32_e32 [[RESULT:v[0-9]+]], 0x7fff, [[VAL]] -; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GCN: s_load_dword [[VAL:s[0-9]+]] + +; CI: s_and_b32 [[RESULT:s[0-9]+]], [[VAL]], 0x7fff +; CI: v_mov_b32_e32 [[V_RESULT:v[0-9]+]], [[RESULT]] +; CI: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]] +; GFX89: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff +; GFX89: v_and_b32_e32 [[V_RESULT:v[0-9]+]], [[VAL]], [[MASK]] +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]] define amdgpu_kernel void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) { %bc= bitcast i16 %in to half %fabs = call half @llvm.fabs.f16(half %bc) @@ -19,9 +24,15 @@ define amdgpu_kernel void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) { } ; GCN-LABEL: {{^}}s_fabs_f16: -; CI: flat_load_ushort [[VAL:v[0-9]+]], -; CI: v_and_b32_e32 [[RESULT:v[0-9]+]], 0x7fff, [[VAL]] -; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GCN: s_load_dword [[VAL:s[0-9]+]] + +; CI: s_and_b32 [[RESULT:s[0-9]+]], [[VAL]], 0x7fff +; CI: v_mov_b32_e32 [[V_RESULT:v[0-9]+]], [[RESULT]] +; CI: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]] + +; GFX89: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff +; GFX89: v_and_b32_e32 [[V_RESULT:v[0-9]+]], [[VAL]], [[MASK]] +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_RESULT]] define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) { %fabs = call half @llvm.fabs.f16(half %in) store half %fabs, half addrspace(1)* %out @@ -43,7 +54,6 @@ define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half ; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]] ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]] - ; GCN: {{flat|global}}_store_dwordx2 define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in) @@ -52,18 +62,18 @@ define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half } ; GCN-LABEL: {{^}}fabs_fold_f16: -; GCN: {{flat|global}}_load_ushort [[IN0:v[0-9]+]] -; GCN: {{flat|global}}_load_ushort [[IN1:v[0-9]+]] +; GCN: s_load_dword [[IN0:s[0-9]+]] +; GCN: s_lshr_b32 [[IN1:s[0-9]+]], [[IN0]], 16 -; CI-DAG: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[IN0]] -; CI-DAG: v_cvt_f32_f16_e64 [[ABS_CVT1:v[0-9]+]], |[[IN1]]| -; CI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[ABS_CVT1]], [[CVT0]] -; CI: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]] +; CI-DAG: v_cvt_f32_f16_e64 [[CVT0:v[0-9]+]], |[[IN0]]| +; CI-DAG: v_cvt_f32_f16_e32 [[ABS_CVT1:v[0-9]+]], [[IN1]] +; CI-DAG: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[CVT0]], [[ABS_CVT1]] +; CI-DAG: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]] ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVTRESULT]] -; VI-NOT: and -; VI: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN1]]|, [[IN0]] -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX89: 
v_mov_b32_e32 [[V_IN1:v[0-9]+]], [[IN1]] +; GFX89: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN0]]|, [[V_IN1]] +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) { %fabs = call half @llvm.fabs.f16(half %in0) %fmul = fmul half %fabs, %in1 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index bb6a1643ac0..826fdb2e362 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -3,12 +3,12 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX89 -check-prefix=GFX9 -check-prefix=GCN %s ; GCN-LABEL: {{^}}fneg_fabs_fadd_f16: -; CI: v_cvt_f32_f16_e32 -; CI: v_cvt_f32_f16_e64 [[CVT_ABS_X:v[0-9]+]], |v{{[0-9]+}}| +; CI-DAG: v_cvt_f32_f16_e32 +; CI-DAG: v_cvt_f32_f16_e64 [[CVT_ABS_X:v[0-9]+]], |s{{[0-9]+}}| ; CI: v_sub_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CVT_ABS_X]] ; GFX89-NOT: _and -; GFX89: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}| +; GFX89: v_sub_f16_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}| define amdgpu_kernel void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x, half %y) { %fabs = call half @llvm.fabs.f16(half %x) %fsub = fsub half -0.0, %fabs @@ -19,12 +19,12 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x, ; GCN-LABEL: {{^}}fneg_fabs_fmul_f16: ; CI-DAG: v_cvt_f32_f16_e32 -; CI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG_ABS_X:v[0-9]+]], -|{{v[0-9]+}}| +; CI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG_ABS_X:v[0-9]+]], -|{{s[0-9]+}}| ; CI: v_mul_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, [[CVT_NEG_ABS_X]] ; CI: v_cvt_f16_f32_e32 ; GFX89-NOT: _and -; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], {{v[0-9]+}}, -|{{v[0-9]+}}| +; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], {{s[0-9]+}}, -|{{v[0-9]+}}| ; GFX89-NOT: [[MUL]] ; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] define amdgpu_kernel void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) { @@ -40,7 +40,7 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, ; unless isFabsFree returns true ; GCN-LABEL: {{^}}fneg_fabs_free_f16: -; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}} +; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x8000 define amdgpu_kernel void @fneg_fabs_free_f16(half addrspace(1)* %out, i16 %in) { %bc = bitcast i16 %in to half %fabs = call half @llvm.fabs.f16(half %bc) @@ -50,7 +50,7 @@ define amdgpu_kernel void @fneg_fabs_free_f16(half addrspace(1)* %out, i16 %in) } ; GCN-LABEL: {{^}}fneg_fabs_f16: -; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}} +; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x8000 define amdgpu_kernel void @fneg_fabs_f16(half addrspace(1)* %out, half %in) { %fabs = call half @llvm.fabs.f16(half %in) %fsub = fsub half -0.0, %fabs diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll index 34f464ac4d9..0dc8544640a 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll @@ -28,13 +28,17 @@ define amdgpu_kernel void @v_fneg_f16(half addrspace(1)* %out, half addrspace(1) ret void } -; GCN-LABEL: {{^}}fneg_free_f16: -; GCN: {{flat|global}}_load_ushort [[NEG_VALUE:v[0-9]+]], +; GCN-LABEL: {{^}}s_fneg_free_f16: +; GCN: s_load_dword [[NEG_VALUE:s[0-9]+]], -; XCI: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}} -; CI: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[NEG_VALUE]] -; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]] 
-define amdgpu_kernel void @fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 { +; CI: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}} +; CI: v_mov_b32_e32 [[V_XOR:v[0-9]+]], [[XOR]] +; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_XOR]] + +; GFX89: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x8000 +; GFX89: v_xor_b32_e32 [[XOR:v[0-9]+]], [[NEG_VALUE]], [[MASK]] +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]] +define amdgpu_kernel void @s_fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 { %bc = bitcast i16 %in to half %fsub = fsub half -0.0, %bc store half %fsub, half addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 453d8fc387d..862869a3cfe 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -4,9 +4,9 @@ ; half args should be promoted to float for SI and lower. ; GCN-LABEL: {{^}}load_f16_arg: -; GCN: flat_load_ushort [[ARG:v[0-9]+]] -; GCN-NOT: [[ARG]] -; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ARG]] +; GCN: s_load_dword [[ARG:s[0-9]+]] +; GCN: v_mov_b32_e32 [[V_ARG:v[0-9]+]], [[ARG]] +; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V_ARG]] define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { store half %arg, half addrspace(1)* %out ret void @@ -22,8 +22,9 @@ define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x ha } ; GCN-LABEL: {{^}}load_v3f16_arg: -; GCN: flat_load_ushort ; GCN: s_load_dword s +; GCN: s_load_dword s +; GCN-NOT: {buffer|flat|global}}_load_ ; GCN-NOT: _load ; GCN-DAG: _store_dword @@ -76,10 +77,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* } ; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg: -; GCN: flat_load_ushort -; GCN: flat_load_ushort -; GCN: flat_load_ushort -; GCN-NOT: {{buffer|flat|global}}_load +; GCN: s_load_dword s +; GCN: s_load_dword s +; GCN-NOT: _load ; GCN: v_cvt_f32_f16_e32 ; GCN: v_cvt_f32_f16_e32 ; GCN: v_cvt_f32_f16_e32 @@ -101,20 +101,10 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* } ; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg: -; SI: flat_load_ushort -; SI: flat_load_ushort -; SI: flat_load_ushort -; SI: flat_load_ushort -; SI: flat_load_ushort -; SI: flat_load_ushort -; SI: flat_load_ushort -; SI: flat_load_ushort - - -; VI: s_load_dword s -; VI: s_load_dword s -; VI: s_load_dword s -; VI: s_load_dword s +; GCN: s_load_dword s +; GCN: s_load_dword s +; GCN: s_load_dword s +; GCN: s_load_dword s ; GCN: v_cvt_f32_f16_e32 ; GCN: v_cvt_f32_f16_e32 @@ -134,7 +124,7 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* } ; GCN-LABEL: {{^}}extload_f16_to_f64_arg: -; GCN: flat_load_ushort [[ARG:v[0-9]+]] +; GCN: s_load_dword [[ARG:s[0-9]+]] ; GCN: v_cvt_f32_f16_e32 v[[ARG_F32:[0-9]+]], [[ARG]] ; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], v[[ARG_F32]] ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] @@ -145,11 +135,8 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, hal } ; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg: -; SI-DAG: flat_load_ushort v -; SI-DAG: flat_load_ushort v - -; VI-DAG: s_load_dword s -; VI: s_lshr_b32 +; GCN: s_load_dword +; GCN: s_lshr_b32 ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f32_f16_e32 @@ -163,9 +150,10 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* } ; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg: -; GCN-DAG: flat_load_ushort v -; GCN-DAG: flat_load_ushort v -; 
GCN-DAG: flat_load_ushort v +; GCN: s_load_dword +; GCN: s_load_dword +; GCN: s_lshr_b32 + ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f32_f16_e32 @@ -180,13 +168,8 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* } ; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg: -; SI: flat_load_ushort v -; SI: flat_load_ushort v -; SI: flat_load_ushort v -; SI: flat_load_ushort v - -; VI: s_load_dword s -; VI: s_load_dword s +; GCN: s_load_dword s +; GCN: s_load_dword s ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f32_f16_e32 @@ -204,23 +187,11 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* } ; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg: -; SI: flat_load_ushort v -; SI: flat_load_ushort v -; SI: flat_load_ushort v -; SI: flat_load_ushort v - -; SI: flat_load_ushort v -; SI: flat_load_ushort v -; SI: flat_load_ushort v -; SI: flat_load_ushort v - - -; VI: s_load_dword s -; VI: s_load_dword s -; VI: s_load_dword s -; VI: s_load_dword s - - +; GCN: s_load_dword s +; GCN-NEXT: s_load_dword s +; GCN-NEXT: s_load_dword s +; GCN-NEXT: s_load_dword s +; GCN-NOT: _load_ ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f32_f16_e32 @@ -237,10 +208,10 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* ; GCN-DAG: v_cvt_f64_f32_e32 ; GCN-DAG: v_cvt_f64_f32_e32 -; GCN-DAG: v_cvt_f64_f32_e32 -; GCN-DAG: v_cvt_f64_f32_e32 -; GCN-DAG: v_cvt_f64_f32_e32 -; GCN-DAG: v_cvt_f64_f32_e32 +; GCN: v_cvt_f64_f32_e32 +; GCN: v_cvt_f64_f32_e32 +; GCN: v_cvt_f64_f32_e32 +; GCN: v_cvt_f64_f32_e32 ; GCN: s_endpgm define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll index 82f72140e75..dcf3b36be2b 100644 --- a/llvm/test/CodeGen/AMDGPU/imm16.ll +++ b/llvm/test/CodeGen/AMDGPU/imm16.ll @@ -124,8 +124,8 @@ define amdgpu_kernel void @store_literal_imm_f16(half addrspace(1)* %out) { } ; GCN-LABEL: {{^}}add_inline_imm_0.0_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0, [[VAL]]{{$}} +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 0{{$}} ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0.0 @@ -134,8 +134,8 @@ define amdgpu_kernel void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half } ; GCN-LABEL: {{^}}add_inline_imm_0.5_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}} +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 0.5{{$}} ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0.5 @@ -144,8 +144,8 @@ define amdgpu_kernel void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half } ; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], -0.5, [[VAL]]{{$}} +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], -0.5{{$}} ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, -0.5 @@ -154,8 +154,8 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, h } ; GCN-LABEL: {{^}}add_inline_imm_1.0_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; 
VI: v_add_f16_e32 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}} +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 1.0{{$}} ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 1.0 @@ -164,8 +164,8 @@ define amdgpu_kernel void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half } ; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}} +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], -1.0{{$}} ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, -1.0 @@ -174,8 +174,8 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, h } ; GCN-LABEL: {{^}}add_inline_imm_2.0_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}} +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 2.0{{$}} ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 2.0 @@ -184,8 +184,8 @@ define amdgpu_kernel void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half } ; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}} +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], -2.0{{$}} ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, -2.0 @@ -194,8 +194,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, h } ; GCN-LABEL: {{^}}add_inline_imm_4.0_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}} +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 4.0{{$}} ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 4.0 @@ -204,8 +204,8 @@ define amdgpu_kernel void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half } ; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}} +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], -4.0{{$}} ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, -4.0 @@ -236,8 +236,8 @@ define amdgpu_kernel void @commute_add_literal_f16(half addrspace(1)* %out, half } ; GCN-LABEL: {{^}}add_inline_imm_1_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1, [[VAL]]{{$}} +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 1{{$}} ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH0001 @@ -246,8 +246,8 @@ define amdgpu_kernel void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x } ; GCN-LABEL: {{^}}add_inline_imm_2_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2, [[VAL]]{{$}} +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 2{{$}} ; VI: buffer_store_short [[REG]] define 
amdgpu_kernel void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH0002 @@ -256,8 +256,8 @@ define amdgpu_kernel void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x } ; GCN-LABEL: {{^}}add_inline_imm_16_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], 16, [[VAL]]{{$}} +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 16{{$}} ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH0010 @@ -268,9 +268,9 @@ define amdgpu_kernel void @add_inline_imm_16_f16(half addrspace(1)* %out, half % ; GCN-LABEL: {{^}}add_inline_imm_neg_1_f16: ; VI: v_add_u32_e32 [[REG:v[0-9]+]], vcc, -1 ; VI: buffer_store_short [[REG]] -define amdgpu_kernel void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, half %x) { - %xbc = bitcast half %x to i16 - %y = add i16 %xbc, -1 +define amdgpu_kernel void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, i16 addrspace(1)* %in) { + %x = load i16, i16 addrspace(1)* %in + %y = add i16 %x, -1 %ybc = bitcast i16 %y to half store half %ybc, half addrspace(1)* %out ret void @@ -279,9 +279,9 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, hal ; GCN-LABEL: {{^}}add_inline_imm_neg_2_f16: ; VI: v_add_u32_e32 [[REG:v[0-9]+]], vcc, 0xfffe ; VI: buffer_store_short [[REG]] -define amdgpu_kernel void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, half %x) { - %xbc = bitcast half %x to i16 - %y = add i16 %xbc, -2 +define amdgpu_kernel void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, i16 addrspace(1)* %in) { + %x = load i16, i16 addrspace(1)* %in + %y = add i16 %x, -2 %ybc = bitcast i16 %y to half store half %ybc, half addrspace(1)* %out ret void @@ -290,17 +290,17 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, hal ; GCN-LABEL: {{^}}add_inline_imm_neg_16_f16: ; VI: v_add_u32_e32 [[REG:v[0-9]+]], vcc, 0xfff0 ; VI: buffer_store_short [[REG]] -define amdgpu_kernel void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, half %x) { - %xbc = bitcast half %x to i16 - %y = add i16 %xbc, -16 +define amdgpu_kernel void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, i16 addrspace(1)* %in) { + %x = load i16, i16 addrspace(1)* %in + %y = add i16 %x, -16 %ybc = bitcast i16 %y to half store half %ybc, half addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}add_inline_imm_63_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], 63, [[VAL]] +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 63 ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH003F @@ -309,8 +309,8 @@ define amdgpu_kernel void @add_inline_imm_63_f16(half addrspace(1)* %out, half % } ; GCN-LABEL: {{^}}add_inline_imm_64_f16: -; VI: buffer_load_ushort [[VAL:v[0-9]+]] -; VI: v_add_f16_e32 [[REG:v[0-9]+]], 64, [[VAL]] +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: v_add_f16_e64 [[REG:v[0-9]+]], [[VAL]], 64 ; VI: buffer_store_short [[REG]] define amdgpu_kernel void @add_inline_imm_64_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH0040 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index dab1c6138c6..ea3d466b9bb 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -202,14 +202,21 @@ define 
amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* % } ; GCN-LABEL: {{^}}dynamic_insertelement_v2i8: -; VI: buffer_load_ushort [[LOAD:v[0-9]]] -; VI: s_load_dword [[IDX:s[0-9]]] +; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; VI-NOT: _load +; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[LOAD]], 8 ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 -; VI: v_lshlrev_b16_e64 [[SHL:v[0-9]+]], [[SCALED_IDX]], -1 -; VI: v_xor_b32_e32 [[NOT:v[0-9]+]], -1, [[SHL]] -; VI: v_and_b32_e32 [[AND0:v[0-9]+]], 5, [[SHL]] -; VI: v_and_b32_e32 [[AND1:v[0-9]+]], [[NOT]], [[LOAD]] -; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[AND0]], [[AND1]] +; VI: v_lshlrev_b16_e64 [[ELT1_SHIFT:v[0-9]+]], 8, [[ELT1]] +; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}} +; VI: v_lshlrev_b16_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], -1 + +; VI: v_xor_b32_e32 [[NOT:v[0-9]+]], -1, [[MASK]] +; VI: v_or_b32_e32 [[BUILD_VECTOR:v[0-9]+]], [[ELT0]], [[ELT1_SHIFT]] + +; VI: v_and_b32_e32 [[AND1:v[0-9]+]], [[NOT]], [[BUILD_VECTOR]] +; VI-DAG: v_and_b32_e32 [[INSERT:v[0-9]+]], 5, [[MASK]] +; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[INSERT]], [[BUILD_VECTOR]] ; VI: buffer_store_short [[OR]] define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind { %vecins = insertelement <2 x i8> %a, i8 5, i32 %b @@ -217,17 +224,32 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %ou ret void } +; FIXME: post legalize i16 and i32 shifts aren't merged because of +; isTypeDesirableForOp in SimplifyDemandedBits + ; GCN-LABEL: {{^}}dynamic_insertelement_v3i8: -; VI: buffer_load_ubyte -; VI: buffer_load_ushort -; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 3 -; VI: s_lshl_b32 s{{[0-9]+}}, 0xffff, -; VI: s_not_b32 -; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; VI: v_or_b32_e32 -; VI: v_and_b32 -; VI: v_bfi_b32 -; VI: v_lshrrev_b32 +; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; VI-NOT: _load + +; VI: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[LOAD]], 8 +; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[VEC_HI]] +; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}} +; VI: v_or_b32_e32 [[BUILD_VEC:v[0-9]+]], [[VEC_HI]], [[ELT2]] +; VI: s_and_b32 [[ELT2:s[0-9]+]], [[LOAD]], 0xff0000{{$}} + +; VI: s_mov_b32 [[MASK16:s[0-9]+]], 0xffff{{$}} +; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 +; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], [[MASK16]], [[SCALED_IDX]] + +; VI: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]] +; VI: v_or_b32_sdwa [[SDWA:v[0-9]+]], [[BUILD_VEC]], [[V_ELT2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI: s_not_b32 [[NOT_SHIFT_MASK:s[0-9]+]], [[SHIFTED_MASK]] +; VI: v_and_b32_e32 [[AND_NOT_MASK:v[0-9]+]], [[NOT_SHIFT_MASK]], [[SDWA]] +; VI: v_lshrrev_b32_e32 [[HI2:v[0-9]+]], 16, [[AND_NOT_MASK]] +; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SCALED_IDX]], 5, [[SDWA]] +; VI: buffer_store_short [[BFI]] +; VI: buffer_store_byte [[HI2]] define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind { %vecins = insertelement <3 x i8> %a, i8 5, i32 %b store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4 @@ -235,21 +257,37 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %ou } ; GCN-LABEL: {{^}}dynamic_insertelement_v4i8: -; VI: s_load_dword [[VEC:s[0-9]+]] -; VI: s_load_dword [[IDX:s[0-9]]] 
+; VI: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; VI-NOT: _load + +; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 8 +; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[ELT1]] +; VI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xff{{$}} + + +; VI: s_lshr_b32 [[ELT3:s[0-9]+]], [[VEC]], 24 +; VI: s_lshr_b32 [[ELT2:s[0-9]+]], [[VEC]], 16 +; VI: v_lshlrev_b16_e64 v{{[0-9]+}}, 8, [[ELT3]] +; VI: v_or_b32_e32 +; VI: v_or_b32_sdwa ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 -; VI-DAG: s_lshl_b32 [[MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]] -; VI-DAG: v_mov_b32_e32 [[V_VEC:v[0-9]+]], [[VEC]] -; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[MASK]], 5, [[V_VEC]] +; VI: v_or_b32_sdwa +; VI: s_lshl_b32 +; VI: v_bfi_b32 define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind { %vecins = insertelement <4 x i8> %a, i8 5, i32 %b store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4 ret void } -; GCN-LABEL: {{^}}dynamic_insertelement_v8i8: -; VI: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; GCN-LABEL: {{^}}s_dynamic_insertelement_v8i8: +; VI-NOT: {{buffer|flat|global}} ; VI: s_load_dword [[IDX:s[0-9]]] +; VI-NOT: {{buffer|flat|global}} +; VI: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0 +; VI-NOT: {{buffer|flat|global}} + ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3 ; VI-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0 ; VI-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff @@ -261,29 +299,22 @@ define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %ou ; VI: v_mov_b32_e32 v[[V_RESULT0:[0-9]+]], s[[RESULT0]] ; VI: v_mov_b32_e32 v[[V_RESULT1:[0-9]+]], s[[RESULT1]] ; VI: buffer_store_dwordx2 v{{\[}}[[V_RESULT0]]:[[V_RESULT1]]{{\]}} -define amdgpu_kernel void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind { +define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind { + %a = load <8 x i8>, <8 x i8> addrspace(4)* %a.ptr, align 4 %vecins = insertelement <8 x i8> %a, i8 5, i32 %b store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8 ret void } ; GCN-LABEL: {{^}}dynamic_insertelement_v16i8: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte +; GCN: s_load_dwordx2 +; GCN: s_load_dword s +; GCN: s_load_dword s +; GCN: s_load_dword s +; GCN: s_load_dword s +; GCN: s_load_dword s +; GCN-NOT: _load_ + ; GCN: buffer_store_byte ; GCN: buffer_store_byte diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index cb97d716e38..2f7d41ccf03 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -11,12 +11,10 @@ ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb ; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c ; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff -; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8 -; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0 -; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]] -; HSA-VI: v_mov_b32_e32 
v[[VPTR_HI:[0-9]+]], [[SPTR_HI]] -; FIXME: Should be using s_load_dword -; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}} + +; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 +; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff + define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { entry: @@ -31,13 +29,9 @@ entry: ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c -; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8 -; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0 -; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]] -; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]] -; FIXME: Should be using s_load_dword -; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}} +; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 +; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind { entry: %0 = zext i8 %in to i32 @@ -50,14 +44,12 @@ entry: ; HSA-VI: kernarg_segment_alignment = 4 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb + ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c -; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8 -; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0 -; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]] -; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]] -; FIXME: Should be using s_load_dword -; HSA-VI: flat_load_sbyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}} +; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 +; HSA-VI: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]] +; HSA-VI: flat_store_dword define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind { entry: %0 = sext i8 %in to i32 @@ -71,15 +63,13 @@ entry: ; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb + ; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c ; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff -; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8 -; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0 -; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]] -; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]] -; FIXME: Should be using s_load_dword -; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}} +; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 +; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}} +; HSA-VI: flat_store_dword define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { entry: %0 = zext i16 %in to i32 @@ -94,13 +84,10 @@ entry: ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c -; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8 -; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0 -; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]] -; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]] -; FIXME: Should be using s_load_dword -; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}} +; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 +; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}} +; HSA-VI: flat_store_dword define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind { entry: %0 = zext i16 %in to 
i32 @@ -115,13 +102,11 @@ entry: ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c -; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8 -; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0 -; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]] -; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]] -; FIXME: Should be using s_load_dword -; HSA-VI: flat_load_sshort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}} + +; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 +; HSA-VI: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]] +; HSA-VI: flat_store_dword define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind { entry: %0 = sext i16 %in to i32 @@ -163,10 +148,8 @@ entry: ; EG: VTX_READ_8 ; EG: VTX_READ_8 -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte - -; HSA: flat_load_ushort +; GCN: s_load_dword s +; GCN-NOT: {{buffer|flat|global}}_load_ define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) { entry: store <2 x i8> %in, <2 x i8> addrspace(1)* %out @@ -226,15 +209,9 @@ entry: ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42 -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte - -; MESA-VI: buffer_load_ushort -; MESA-VI: buffer_load_ubyte -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ubyte +; GCN: s_load_dword s +; GCN-NOT: {{buffer|flat|global}}_load_ define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind { entry: store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4 @@ -249,8 +226,8 @@ entry: ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48 -; GCN-DAG: s_load_dword s -; GCN-DAG: {{buffer|flat}}_load_ushort +; GCN: s_load_dword s +; GCN: s_load_dword s define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind { entry: store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4 @@ -294,12 +271,8 @@ entry: ; EG: VTX_READ_8 ; EG: VTX_READ_8 -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte - -; VI: s_load_dword s +; GCN: s_load_dword s +; GCN-NOT: {{buffer|flat|global}}_load_ define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) { entry: store <4 x i8> %in, <4 x i8> addrspace(1)* %out @@ -314,7 +287,8 @@ entry: ; EG: VTX_READ_16 ; EG: VTX_READ_16 -; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb +; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb +; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc ; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9 ; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x2c @@ -361,6 +335,7 @@ entry: ret void } +; FIXME: Lots of unpack and re-pack junk on VI ; FUNC-LABEL: {{^}}v8i8_arg: ; HSA-VI: kernarg_segment_byte_size = 16 ; HSA-VI: kernarg_segment_alignment = 4 @@ -373,16 +348,23 @@ entry: ; EG: VTX_READ_8 ; EG: VTX_READ_8 -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; VI: s_load_dwordx2 -; VI: s_load_dwordx2 +; SI: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dwordx2 s +; SI-NOT: {{buffer|flat|global}}_load + +; VI: s_load_dword s +; VI: s_load_dword s + +; VI: 
v_lshlrev_b16 +; VI: v_or_b32_e32 +; VI: v_or_b32_sdwa +; VI: v_or_b32_sdwa +; VI: v_lshlrev_b16 +; VI: s_lshr_b32 +; VI: v_or_b32_sdwa +; VI: v_or_b32_sdwa define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) { entry: store <8 x i8> %in, <8 x i8> addrspace(1)* %out @@ -401,9 +383,13 @@ entry: ; EG: VTX_READ_16 ; EG: VTX_READ_16 +; SI: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dword s ; SI: s_load_dwordx2 -; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; SI-NOT: {{buffer|flat|global}}_load + ; VI: s_load_dwordx2 ; VI: s_load_dword s @@ -454,6 +440,8 @@ entry: ret void } +; FIXME: Pack/repack on VI + ; FUNC-LABEL: {{^}}v16i8_arg: ; HSA-VI: kernarg_segment_byte_size = 32 ; HSA-VI: kernarg_segment_alignment = 4 @@ -474,26 +462,33 @@ entry: ; EG: VTX_READ_8 ; EG: VTX_READ_8 -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte +; SI: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dwordx2 +; SI-NOT: {{buffer|flat|global}}_load -; VI: s_load_dwordx2 -; VI: s_load_dwordx2 -; VI: s_load_dwordx2 + +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s + +; VI: s_lshr_b32 +; VI: v_lshlrev_b16 +; VI: s_lshr_b32 +; VI: s_lshr_b32 +; VI: v_or_b32_sdwa +; VI: v_or_b32_sdwa +; VI: v_lshlrev_b16 +; VI: v_lshlrev_b16 +; VI: v_or_b32_sdwa +; VI: v_or_b32_sdwa +; VI: v_lshlrev_b16 +; VI: v_lshlrev_b16 +; VI: v_or_b32_sdwa +; VI: v_or_b32_sdwa define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) { entry: store <16 x i8> %in, <16 x i8> addrspace(1)* %out @@ -508,6 +503,7 @@ entry: ; EG: VTX_READ_16 ; EG: VTX_READ_16 ; EG: VTX_READ_16 + ; EG: VTX_READ_16 ; EG: VTX_READ_16 ; EG: VTX_READ_16 @@ -524,9 +520,13 @@ entry: ; SI: s_load_dword s ; SI: s_load_dword s ; SI: s_load_dword s -; SI: s_load_dwordx2 -; SI: s_load_dwordx2 -; SI: s_load_dwordx2 +; SI: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dword s +; SI: s_load_dword s + +; SI-NOT: {{buffer|flat|global}}_load + ; VI: s_load_dword s ; VI: s_load_dword s @@ -634,10 +634,9 @@ entry: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 -; SI: buffer_load_ubyte -; SI: v_and_b32_e32 -; SI: buffer_store_byte -; SI: s_endpgm +; GCN: s_load_dword s +; GCN: s_and_b32 +; GCN: {{buffer|flat}}_store_byte define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind { store i1 %x, i1 addrspace(1)* %out, align 1 ret void @@ -647,9 +646,8 @@ define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind { ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 -; SI: buffer_load_ubyte -; SI: buffer_store_dword -; SI: s_endpgm +; GCN: s_load_dword +; SGCN: buffer_store_dword define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { %ext = zext i1 %x to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 @@ -660,9 +658,8 @@ define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwi ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 -; SI: buffer_load_ubyte -; SI: buffer_store_dwordx2 -; SI: s_endpgm +; GCN: 
s_load_dword s +; GCN: {{buffer|flat}}_store_dwordx2 define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { %ext = zext i1 %x to i64 store i64 %ext, i64 addrspace(1)* %out, align 8 @@ -673,9 +670,8 @@ define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwi ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 -; SI: buffer_load_ubyte -; SI: buffer_store_dword -; SI: s_endpgm +; GCN: s_load_dword +; GCN: {{buffer|flat}}_store_dword define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { %ext = sext i1 %x to i32 store i32 %ext, i32addrspace(1)* %out, align 4 @@ -686,11 +682,9 @@ define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwi ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 -; SI: buffer_load_ubyte -; SI: v_bfe_i32 -; SI: v_ashrrev_i32 -; SI: buffer_store_dwordx2 -; SI: s_endpgm +; GCN: s_load_dword +; GCN: s_bfe_i64 +; GCN: {{buffer|flat}}_store_dwordx2 define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { %ext = sext i1 %x to i64 store i64 %ext, i64 addrspace(1)* %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll index 5e9fec13ac4..0fdbb6d0115 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll @@ -3,8 +3,9 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,GFX9 %s ; GCN-LABEL: {{^}}buffer_store_format_d16_x: -; GCN: {{buffer|flat|global}}_load_ushort v[[LO:[0-9]+]] -; GCN: buffer_store_format_d16_x v[[LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; GCN: s_load_dword s[[LO:[0-9]+]] +; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]] +; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, half %data, i32 %index) { main_body: call void @llvm.amdgcn.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll index 91cddb1e061..ad5d3f58a07 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll @@ -24,9 +24,10 @@ entry: } ; GCN-LABEL: {{^}}class_f16_fabs: -; GCN-DAG: buffer_load_ushort v[[SA_F16:[0-9]+]] -; GCN-DAG: s_load_dword s[[SB_I32:[0-9]+]] -; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |v[[SA_F16]]|, s[[SB_I32]] +; GCN: s_load_dword s[[SA_F16:[0-9]+]] +; GCN: s_load_dword s[[SB_I32:[0-9]+]] +; GCN: v_mov_b32_e32 [[V_B_I32:v[0-9]+]], s[[SB_I32]] +; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |s[[SA_F16]]|, [[V_B_I32]] ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm @@ -42,10 +43,11 @@ entry: ret void } -; GCN-LABEL: {{^}}class_f16_fneg -; GCN: buffer_load_ushort v[[SA_F16:[0-9]+]] +; GCN-LABEL: {{^}}class_f16_fneg: +; GCN: s_load_dword s[[SA_F16:[0-9]+]] ; GCN: s_load_dword s[[SB_I32:[0-9]+]] -; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -v[[SA_F16]], s[[SB_I32]] +; GCN: v_mov_b32_e32 [[V_B_I32:v[0-9]+]], s[[SB_I32]] +; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -s[[SA_F16]], [[V_B_I32]] ; VI: 
v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm @@ -61,10 +63,11 @@ entry: ret void } -; GCN-LABEL: {{^}}class_f16_fabs_fneg -; GCN-DAG: buffer_load_ushort v[[SA_F16:[0-9]+]] -; GCN-DAG: s_load_dword s[[SB_I32:[0-9]+]] -; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|v[[SA_F16]]|, s[[SB_I32]] +; GCN-LABEL: {{^}}class_f16_fabs_fneg: +; GCN: s_load_dword s[[SA_F16:[0-9]+]] +; GCN: s_load_dword s[[SB_I32:[0-9]+]] +; GCN: v_mov_b32_e32 [[V_B_I32:v[0-9]+]], s[[SB_I32]] +; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|s[[SA_F16]]|, [[V_B_I32]] ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm @@ -82,8 +85,8 @@ entry: } ; GCN-LABEL: {{^}}class_f16_1: -; GCN: buffer_load_ushort v[[SA_F16:[0-9]+]] -; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], v[[SA_F16]], 1{{$}} +; GCN: s_load_dword s[[SA_F16:[0-9]+]] +; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s[[SA_F16]], 1{{$}} ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm @@ -98,8 +101,8 @@ entry: } ; GCN-LABEL: {{^}}class_f16_64 -; GCN: buffer_load_ushort v[[SA_F16:[0-9]+]] -; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], v[[SA_F16]], 64{{$}} +; GCN: s_load_dword s[[SA_F16:[0-9]+]] +; VI: v_cmp_class_f16_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s[[SA_F16]], 64{{$}} ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm @@ -114,9 +117,9 @@ entry: } ; GCN-LABEL: {{^}}class_f16_full_mask: -; GCN: buffer_load_ushort v[[SA_F16:[0-9]+]] +; GCN: s_load_dword s[[SA_F16:[0-9]+]] ; VI: v_mov_b32_e32 v[[MASK:[0-9]+]], 0x3ff{{$}} -; VI: v_cmp_class_f16_e32 vcc, v[[SA_F16]], v[[MASK]] +; VI: v_cmp_class_f16_e32 vcc, s[[SA_F16]], v[[MASK]] ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm @@ -130,10 +133,10 @@ entry: ret void } -; GCN-LABEL: {{^}}class_f16_nine_bit_mask -; GCN: buffer_load_ushort v[[SA_F16:[0-9]+]] +; GCN-LABEL: {{^}}class_f16_nine_bit_mask: +; GCN: s_load_dword s[[SA_F16:[0-9]+]] ; VI: v_mov_b32_e32 v[[MASK:[0-9]+]], 0x1ff{{$}} -; VI: v_cmp_class_f16_e32 vcc, v[[SA_F16]], v[[MASK]] +; VI: v_cmp_class_f16_e32 vcc, s[[SA_F16]], v[[MASK]] ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll index 4cb9e8c8d64..63ce22078ee 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll @@ -50,8 +50,9 @@ main_body: } ; GCN-LABEL: {{^}}image_store_f16 -; GCN: {{flat|global}}_load_ushort v[[LO:[0-9]+]], -; GCN: image_store v[[LO]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 unorm d16 +; GCN: s_load_dword s[[LO:[0-9]+]], +; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]] +; GCN: image_store v[[V_LO]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 unorm d16 define amdgpu_kernel void @image_store_f16(half %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) { main_body: call void @llvm.amdgcn.image.store.f16.v4i32.v8i32(half %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 1, i1 false, i1 false, i1 false, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll index 2fb7c4e1270..f9a2281baa6 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll @@ -4,8 +4,9 @@ ; GCN-LABEL: {{^}}tbuffer_store_d16_x: -; GCN: {{flat|global}}_load_ushort v[[LO:[0-9]+]], -; GCN: tbuffer_store_format_d16_x v[[LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +; GCN: s_load_dword s[[S_LO:[0-9]+]] +; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]] +; GCN: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 %vindex) { main_body: call void @llvm.amdgcn.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0) diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll index 350ecedb80d..413c93d533a 100644 --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -18,10 +18,18 @@ ; VI-DAG: s_lshl_b32 ; VI: v_or_b32_e32 -; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; CI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16 -; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI: s_load_dword s +; CI-NEXT: s_load_dword s +; CI-NOT: {{buffer|flat}} +; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}} +; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; CI: s_and_b32 +; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; CI: s_and_b32 +; CI: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16 +; CI: s_lshl_b32 +; CI: v_or_b32_e32 define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = lshr <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index b692b2226c6..b4b1ab82492 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -76,32 +76,25 @@ define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 % ; extloads with mubuf instructions. 
; FUNC-LABEL: {{^}}s_test_imin_sle_v4i8: -; GCN: buffer_load_sbyte -; GCN: buffer_load_sbyte -; GCN: buffer_load_sbyte -; GCN: buffer_load_sbyte -; GCN: buffer_load_sbyte -; GCN: buffer_load_sbyte -; GCN: buffer_load_sbyte -; GCN: buffer_load_sbyte +; GCN: s_load_dword s +; GCN: s_load_dword s +; GCN-NOT: _load_ -; SI: v_min_i32 -; SI: v_min_i32 -; SI: v_min_i32 -; SI: v_min_i32 +; SI: s_min_i32 +; SI: s_min_i32 +; SI: s_min_i32 +; SI: s_min_i32 -; VI: v_min_i32 -; VI: v_min_i32 -; VI: v_min_i32 -; VI: v_min_i32 +; VI: s_min_i32 +; VI: s_min_i32 +; VI: s_min_i32 +; VI: s_min_i32 ; GFX9: v_min_i16 ; GFX9: v_min_i16 ; GFX9: v_min_i16 ; GFX9: v_min_i16 -; GCN: s_endpgm - ; EG: MIN_INT ; EG: MIN_INT ; EG: MIN_INT @@ -114,8 +107,15 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 } ; FUNC-LABEL: {{^}}s_test_imin_sle_v2i16: -; SI: v_min_i32 -; SI: v_min_i32 +; GCN: s_load_dword s +; GCN: s_load_dword s + +; SI: s_ashr_i32 +; SI: s_ashr_i32 +; SI: s_sext_i32_i16 +; SI: s_sext_i32_i16 +; SI: s_min_i32 +; SI: s_min_i32 ; VI: s_sext_i32_i16 ; VI: s_sext_i32_i16 @@ -134,10 +134,11 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, < } ; FUNC-LABEL: {{^}}s_test_imin_sle_v4i16: -; SI: v_min_i32 -; SI: v_min_i32 -; SI: v_min_i32 -; SI: v_min_i32 +; SI-NOT: buffer_load +; SI: s_min_i32 +; SI: s_min_i32 +; SI: s_min_i32 +; SI: s_min_i32 ; VI: s_min_i32 ; VI: s_min_i32 @@ -453,14 +454,15 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, < } ; FUNC-LABEL: {{^}}s_test_umin_ult_v8i16: -; SI: v_min_u32 -; SI: v_min_u32 -; SI: v_min_u32 -; SI: v_min_u32 -; SI: v_min_u32 -; SI: v_min_u32 -; SI: v_min_u32 -; SI: v_min_u32 +; GCN-NOT: {{buffer|flat|global}}_load +; SI: s_min_u32 +; SI: s_min_u32 +; SI: s_min_u32 +; SI: s_min_u32 +; SI: s_min_u32 +; SI: s_min_u32 +; SI: s_min_u32 +; SI: s_min_u32 ; VI: s_min_u32 ; VI: s_min_u32 diff --git a/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll b/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll index f2f845b86db..76508731aca 100644 --- a/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll @@ -40,7 +40,10 @@ define amdgpu_kernel void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)* ; GCN-LABEL: {{^}}store_v4i16_as_v2i32_align_4: ; GCN: s_load_dword s -; GCN: s_load_dwordx2 s +; GCN-NEXT: s_load_dword s +; GCN-NEXT: s_load_dword s +; GCN-NOT: {{buffer|flat|global}} + ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} define amdgpu_kernel void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 { %x.bc = bitcast <4 x i16> %x to <2 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/select-i1.ll b/llvm/test/CodeGen/AMDGPU/select-i1.ll index 5eaad1f363f..f784e225ded 100644 --- a/llvm/test/CodeGen/AMDGPU/select-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/select-i1.ll @@ -1,11 +1,11 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s -; FIXME: This should go in existing select.ll test, except the current testcase there is 
broken on SI +; FIXME: This should go in existing select.ll test, except the current testcase there is broken on GCN -; FUNC-LABEL: {{^}}select_i1: -; SI: v_cndmask_b32 -; SI-NOT: v_cndmask_b32 +; GCN-LABEL: {{^}}select_i1: +; GCN: v_cndmask_b32 +; GCN-NOT: v_cndmask_b32 define amdgpu_kernel void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind { %cmp = icmp ugt i32 %cond, 5 %sel = select i1 %cmp, i1 %a, i1 %b @@ -13,12 +13,16 @@ define amdgpu_kernel void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 ret void } -; FUNC-LABEL: {{^}}s_minmax_i1: -; SI-DAG: buffer_load_ubyte [[COND:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 -; SI-DAG: buffer_load_ubyte [[A:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:45 -; SI-DAG: buffer_load_ubyte [[B:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46 -; SI: v_cmp_eq_u32_e32 vcc, 1, [[COND]] -; SI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] +; GCN-LABEL: {{^}}s_minmax_i1: +; GCN: s_load_dword [[LOAD:s[0-9]+]], +; GCN-DAG: s_lshr_b32 [[A:s[0-9]+]], [[LOAD]], 8 +; GCN-DAG: s_lshr_b32 [[B:s[0-9]+]], [[LOAD]], 16 +; GCN-DAG: s_and_b32 [[COND:s[0-9]+]], 1, [[LOAD]] +; GCN-DAG: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]] +; GCN-DAG: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]] +; GCN: v_cmp_eq_u32_e64 vcc, [[COND]], 1 +; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], [[V_B]], [[V_A]] +; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, [[SEL]] define amdgpu_kernel void @s_minmax_i1(i1 addrspace(1)* %out, i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind { %cmp = icmp slt i1 %cond, false %sel = select i1 %cmp, i1 %a, i1 %b diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll index e74b526070c..70e11d204ad 100644 --- a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll @@ -663,10 +663,10 @@ define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i16(<2 x i16> addrspace(1)* %ou ; FUNC-LABEL: {{^}}sext_in_reg_v3i1_to_v3i16: ; GFX9: v_pk_add_u16 -; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}} -; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}} ; GFX9: v_pk_add_u16 ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}} +; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}} +; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}} ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}} define amdgpu_kernel void @sext_in_reg_v3i1_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 { %c = add <3 x i16> %a, %b ; add to prevent folding into extload @@ -702,11 +702,10 @@ define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i16(<2 x i16> addrspace(1)* %ou ; FUNC-LABEL: {{^}}sext_in_reg_v3i8_to_v3i16: ; GFX9: v_pk_add_u16 -; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} -; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}} - ; GFX9: v_pk_add_u16 ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} +; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} +; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}} define amdgpu_kernel void @sext_in_reg_v3i8_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 { %c = add <3 x i16> %a, %b ; add to prevent folding into extload diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll index 260aac8d159..083ec9edc06 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -14,14 +14,21 @@ ; VI: s_lshr_b32 ; VI: s_and_b32 ; VI: s_and_b32 -; SI: s_and_B32 -; SI: s_or_b32 +; VI: s_and_b32 +; VI: 
s_or_b32 + -; CI-DAG: v_lshlrev_b32_e32 -; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} -; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; CI: v_or_b32_e32 +; CI: s_load_dword s +; CI: s_load_dword s +; CI: s_lshr_b32 +; CI: s_and_b32 +; CI: s_lshr_b32 +; CI: s_lshl_b32 +; CI: s_lshl_b32 +; CI: s_lshl_b32 +; CI: s_and_b32 +; CI: s_or_b32 +; CI: _store_dword define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = shl <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index c6a3cce8672..76dd0800016 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -18,15 +18,24 @@ ; SI: s_and_b32 ; SI: s_or_b32 -; CI: v_sub_i32_e32 -; CI-DAG: v_sub_i32_e32 -; CI: v_bfe_i32 -; CI-DAG: v_bfe_i32 -; CI-DAG: v_add_i32_e32 -; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16 -; CI: v_add_i32_e32 -; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, -; CI: v_or_b32_e32 +; CI-NOT: {{buffer|flat}}_load +; CI: s_load_dword s +; CI-NOT: {{buffer|flat}}_load +; CI: s_lshr_b32 +; CI: s_ashr_i32 +; CI: s_sext_i32_i16 +; CI: s_sub_i32 +; CI: s_sub_i32 +; CI: s_sext_i32_i16 +; CI: s_sext_i32_i16 +; CI: s_max_i32 +; CI: s_max_i32 +; CI: s_lshl_b32 +; CI: s_add_i32 +; CI: s_add_i32 +; CI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xffff +; CI: s_or_b32 + define amdgpu_kernel void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 { %neg = sub <2 x i16> zeroinitializer, %val %cond = icmp sgt <2 x i16> %val, %neg diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll new file mode 100644 index 00000000000..9a2e4280ffb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -0,0 +1,169 @@ +; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s + +; GCN-LABEL: {{^}}widen_i16_constant_load: +; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCN: s_addk_i32 [[VAL]], 0x3e7 +; GCN: s_or_b32 [[OR:s[0-9]+]], [[VAL]], 4 +define amdgpu_kernel void @widen_i16_constant_load(i16 addrspace(4)* %arg) { + %load = load i16, i16 addrspace(4)* %arg, align 4 + %add = add i16 %load, 999 + %or = or i16 %add, 4 + store i16 %or, i16 addrspace(1)* null + ret void +} + +; GCN-LABEL: {{^}}widen_i16_constant_load_zext_i32: +; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCN: s_and_b32 [[TRUNC:s[0-9]+]], [[VAL]], 0xffff{{$}} +; GCN: s_addk_i32 [[TRUNC]], 0x3e7 +; GCN: s_or_b32 [[OR:s[0-9]+]], [[TRUNC]], 4 +define amdgpu_kernel void @widen_i16_constant_load_zext_i32(i16 addrspace(4)* %arg) { + %load = load i16, i16 addrspace(4)* %arg, align 4 + %ext = zext i16 %load to i32 + %add = add i32 %ext, 999 + %or = or i32 %add, 4 + store i32 %or, i32 addrspace(1)* null + ret void +} + +; GCN-LABEL: {{^}}widen_i16_constant_load_sext_i32: +; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCN: s_sext_i32_i16 [[EXT:s[0-9]+]], [[VAL]] +; GCN: s_addk_i32 [[EXT]], 0x3e7 +; GCN: s_or_b32 [[OR:s[0-9]+]], [[EXT]], 4 +define amdgpu_kernel void @widen_i16_constant_load_sext_i32(i16 addrspace(4)* %arg) { + %load = load i16, i16 addrspace(4)* %arg, align 4 + %ext = 
sext i16 %load to i32 + %add = add i32 %ext, 999 + %or = or i32 %add, 4 + store i32 %or, i32 addrspace(1)* null + ret void +} + +; GCN-LABEL: {{^}}widen_i17_constant_load: +; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 34 +; GCN: s_or_b32 [[OR:s[0-9]+]], [[ADD]], 4 +; GCN: s_bfe_u32 s{{[0-9]+}}, [[OR]], 0x10010 +define amdgpu_kernel void @widen_i17_constant_load(i17 addrspace(4)* %arg) { + %load = load i17, i17 addrspace(4)* %arg, align 4 + %add = add i17 %load, 34 + %or = or i17 %add, 4 + store i17 %or, i17 addrspace(1)* null + ret void +} + +; GCN-LABEL: {{^}}widen_f16_constant_load: +; GCN: s_load_dword [[VAL:s[0-9]+]] +; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[VAL]] +; SI: v_add_f32_e32 [[ADD:v[0-9]+]], 4.0, [[CVT]] + +; VI: v_add_f16_e64 [[ADD:v[0-9]+]], [[VAL]], 4.0 +define amdgpu_kernel void @widen_f16_constant_load(half addrspace(4)* %arg) { + %load = load half, half addrspace(4)* %arg, align 4 + %add = fadd half %load, 4.0 + store half %add, half addrspace(1)* null + ret void +} + +; FIXME: valu usage on VI +; GCN-LABEL: {{^}}widen_v2i8_constant_load: +; GCN: s_load_dword [[VAL:s[0-9]+]] + +; SI: s_add_i32 +; SI: s_or_b32 +; SI: s_addk_i32 +; SI: s_and_b32 +; SI: s_or_b32 +; SI: s_or_b32 + +; VI: s_add_i32 +; VI: v_add_u32_sdwa +; VI: v_or_b32_sdwa +; VI: v_or_b32_e32 +define amdgpu_kernel void @widen_v2i8_constant_load(<2 x i8> addrspace(4)* %arg) { + %load = load <2 x i8>, <2 x i8> addrspace(4)* %arg, align 4 + %add = add <2 x i8> %load, <i8 12, i8 44> + %or = or <2 x i8> %add, <i8 4, i8 3> + store <2 x i8> %or, <2 x i8> addrspace(1)* null + ret void +} + +; GCN-LABEL: {{^}}no_widen_i16_constant_divergent_load: +; GCN: {{buffer|flat}}_load_ushort +define amdgpu_kernel void @no_widen_i16_constant_divergent_load(i16 addrspace(4)* %arg) { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = zext i32 %tid to i64 + %gep.arg = getelementptr inbounds i16, i16 addrspace(4)* %arg, i64 %tid.ext + %load = load i16, i16 addrspace(4)* %gep.arg, align 4 + %add = add i16 %load, 999 + %or = or i16 %add, 4 + store i16 %or, i16 addrspace(1)* null + ret void +} + +; GCN-LABEL: {{^}}widen_i1_constant_load: +; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCN: s_and_b32 {{s[0-9]+}}, [[VAL]], 1{{$}} +define amdgpu_kernel void @widen_i1_constant_load(i1 addrspace(4)* %arg) { + %load = load i1, i1 addrspace(4)* %arg, align 4 + %and = and i1 %load, true + store i1 %and, i1 addrspace(1)* null + ret void +} + +; GCN-LABEL: {{^}}widen_i16_zextload_i64_constant_load: +; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCN: s_and_b32 [[TRUNC:s[0-9]+]], [[VAL]], 0xffff{{$}} +; GCN: s_addk_i32 [[TRUNC]], 0x3e7 +; GCN: s_or_b32 [[OR:s[0-9]+]], [[TRUNC]], 4 +define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(i16 addrspace(4)* %arg) { + %load = load i16, i16 addrspace(4)* %arg, align 4 + %zext = zext i16 %load to i32 + %add = add i32 %zext, 999 + %or = or i32 %add, 4 + store i32 %or, i32 addrspace(1)* null + ret void +} + +; GCN-LABEL: {{^}}widen_i1_zext_to_i64_constant_load: +; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCN: s_and_b32 [[AND:s[0-9]+]], [[VAL]], 1 +; GCN: s_add_u32 [[ADD:s[0-9]+]], [[AND]], 0x3e7 +; GCN: s_addc_u32 s{{[0-9]+}}, 0, 0 +define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(i1 addrspace(4)* %arg) { + %load = load i1, i1 addrspace(4)* %arg, align 4 + %zext = zext i1 %load to i64 + %add = add i64 %zext, 999 + store i64 %add, i64 addrspace(1)* null + ret void +} + +; GCN-LABEL: {{^}}widen_i16_constant32_load: +; GCN: s_load_dword [[VAL:s[0-9]+]] +; 
GCN: s_addk_i32 [[VAL]], 0x3e7
+; GCN: s_or_b32 [[OR:s[0-9]+]], [[VAL]], 4
+define amdgpu_kernel void @widen_i16_constant32_load(i16 addrspace(6)* %arg) {
+ %load = load i16, i16 addrspace(6)* %arg, align 4
+ %add = add i16 %load, 999
+ %or = or i16 %add, 4
+ store i16 %or, i16 addrspace(1)* null
+ ret void
+}
+
+; GCN-LABEL: {{^}}widen_i16_global_invariant_load:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; GCN: s_addk_i32 [[VAL]], 0x3e7
+; GCN: s_or_b32 [[OR:s[0-9]+]], [[VAL]], 1
+define amdgpu_kernel void @widen_i16_global_invariant_load(i16 addrspace(1)* %arg) {
+ %load = load i16, i16 addrspace(1)* %arg, align 4, !invariant.load !0
+ %add = add i16 %load, 999
+ %or = or i16 %add, 1
+ store i16 %or, i16 addrspace(1)* null
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+!0 = !{}
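
The new widen-smrd-loads.ll test can be exercised by hand exactly as its RUN lines describe. A minimal sketch follows; the llvm-project source layout and the build/bin output directory are assumptions for illustration, not part of the commit:

    # Assumed layout: an llvm-project checkout with a configured build tree in ./build.
    # llc compiles the test's IR for SI (tahiti); FileCheck then matches the GCN/SI check lines.
    build/bin/llc -amdgpu-codegenprepare-widen-constant-loads=0 -march=amdgcn -mcpu=tahiti \
        -verify-machineinstrs < llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll \
      | build/bin/FileCheck -enable-var-scope -check-prefixes=GCN,SI \
        llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll

Running the file through lit (typically build/bin/llvm-lit llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll) executes both RUN lines, covering the tahiti (GCN,SI) and tonga (GCN,VI) prefix sets.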