diff options
author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-05-22 06:32:10 +0000 |
---|---|---|
committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-05-22 06:32:10 +0000 |
commit | 1349a04ef5f594dda705ec80474dda4837f26dba (patch) | |
tree | c33cdd2eb97c0a6c41289054e0d700cfc46ac2fa /llvm/test | |
parent | bfd08534b020b0feb420766a8d2b3fb0f295f551 (diff) | |
download | bcm5719-llvm-1349a04ef5f594dda705ec80474dda4837f26dba.tar.gz bcm5719-llvm-1349a04ef5f594dda705ec80474dda4837f26dba.zip |
AMDGPU: Make v2i16/v2f16 legal on VI
This usually results in better code. It fixes the use of
inline asm with short2, and also fixes the mismatch in the
ABI for function parameters between VI and gfx9.
Partially cleans up the mess used for lowering of the d16
operations. Making v4f16 legal will help clean this up more,
but this requires additional work.
llvm-svn: 332953
Diffstat (limited to 'llvm/test')
30 files changed, 657 insertions, 511 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index c0bf7ba70a1..abf703017b3 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -1,12 +1,14 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_add_v2i16: ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; FIXME: or should be unnecessary ; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_or_b32 define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -52,21 +54,26 @@ define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, < ; GCN-LABEL: {{^}}s_test_add_v2i16_kernarg: ; GFX9: v_pk_add_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; VI: v_add_u32 -; VI: v_add_u32_sdwa +; VI: s_add_i32 +; VI: s_add_i32 +; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; VI: s_and_b32 +; VI: s_or_b32 define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> 
%a, <2 x i16> %b) #1 { %add = add <2 x i16> %a, %b store <2 x i16> %add, <2 x i16> addrspace(1)* %out ret void } +; FIXME: Eliminate or with sdwa ; GCN-LABEL: {{^}}v_test_add_v2i16_constant: ; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}} ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]] -; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}} ; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0x1c8 -; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}} +; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI: v_or_b32_e32 define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -84,7 +91,7 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %ou ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffcb3, v{{[0-9]+}} ; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0xfffffc21 -; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -99,10 +106,9 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, -1 op_sel_hi:[1,0]{{$}} ; VI: v_mov_b32_e32 v[[SCONST:[0-9]+]], -1 -; VI: flat_load_ushort [[LOAD0:v[0-9]+]] -; VI: 
flat_load_ushort [[LOAD1:v[0-9]+]] -; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD0]], v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD1]] +; VI: flat_load_dword [[LOAD:v[0-9]+]] +; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD]], v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD]] ; VI: v_or_b32_e32 define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -117,10 +123,11 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* ; GCN-LABEL: {{^}}v_test_add_v2i16_inline_lo_zero_hi: ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, 32{{$}} +; VI: flat_load_dword ; VI-NOT: v_add_u16 +; VI: v_and_b32_e32 v{{[0-9]+}}, 0xffff0000, ; VI: v_add_u16_e32 v{{[0-9]+}}, 32, v{{[0-9]+}} ; VI-NOT: v_add_u16 -; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI: v_or_b32_e32 define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -139,9 +146,9 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspac ; VI-NOT: v_add_u16 ; VI: v_mov_b32_e32 v[[K:[0-9]+]], 0x3f80 -; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NOT: v_add_u16 -; VI: v_or_b32_e32 +; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr 
inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -162,15 +169,13 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace( ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}} -; VI: flat_load_ushort v[[A_LO:[0-9]+]] -; VI: flat_load_ushort v[[A_HI:[0-9]+]] -; VI: flat_load_ushort v[[B_LO:[0-9]+]] -; VI: flat_load_ushort v[[B_HI:[0-9]+]] +; VI: flat_load_dword v[[A:[0-9]+]] +; VI: flat_load_dword v[[B:[0-9]+]] -; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]] ; VI-NOT: and ; VI-NOT: shl -; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[A_LO]], v[[B_LO]] +; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[A]], v[[B]] +; VI: v_add_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NOT: and ; VI-NOT: shl ; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}} @@ -198,13 +203,11 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1) ; GFX9: buffer_store_dwordx4 ; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} -; VI: flat_load_ushort v[[A_LO:[0-9]+]] -; VI: flat_load_ushort v[[A_HI:[0-9]+]] -; VI: flat_load_ushort v[[B_LO:[0-9]+]] -; VI: flat_load_ushort v[[B_HI:[0-9]+]] +; VI: flat_load_dword v[[A:[0-9]+]] +; VI: flat_load_dword v[[B:[0-9]+]] ; VI-DAG: v_add_u16_e32 -; VI-DAG: v_add_u16_e32 +; VI: v_add_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI: buffer_store_dwordx4 define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { @@ -230,8 +233,9 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1) ; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}} +; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI: v_add_u16_e32 -; VI: v_add_u16_e32 + ; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 ; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 ; VI: buffer_store_dwordx2 diff --git a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll index f7b25a6fc65..6d3878c8ab2 100644 --- a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll @@ -8,8 +8,17 @@ ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]] ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]] -; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI: s_load_dword [[LHS:s[0-9]+]] +; VI: s_load_dword [[RHS:s[0-9]+]] +; VI: s_ashr_i32 +; VI: s_ashr_i32 +; VI: s_sext_i32_i16 +; VI: s_sext_i32_i16 +; VI: s_ashr_i32 +; VI: s_ashr_i32 +; VI: s_lshl_b32 +; VI: s_and_b32 +; VI: s_or_b32 ; CI-DAG: v_ashrrev_i32_e32 ; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll index 817430ac434..9c8f04fdfd9 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -71,10 +71,15 @@ define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x } ; GCN-LABEL: {{^}}extract_vector_elt_v4i16: -; SICIVI: buffer_load_ushort -; SICIVI: buffer_load_ushort -; SICIVI: buffer_store_short -; SICIVI: buffer_store_short +; SICI: buffer_load_ushort +; SICI: buffer_load_ushort +; SICI: buffer_store_short +; SICI: buffer_store_short + +; VI: s_load_dword s +; VI: s_load_dword s +; VI: buffer_store_short +; VI: buffer_store_short ; GFX9-DAG: s_load_dword [[LOAD0:s[0-9]+]], s[0:1], 0x2c ; GFX9-DAG: s_load_dword 
[[LOAD1:s[0-9]+]], s[0:1], 0x30 @@ -92,9 +97,16 @@ define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16: -; SICIVI: buffer_load_ushort -; SICIVI: buffer_load_ushort -; SICIVI: buffer_load_ushort +; SICI: buffer_load_ushort +; SICI: buffer_load_ushort +; SICI: buffer_load_ushort + +; SICI: buffer_store_short +; SICI: buffer_store_short +; SICI: buffer_store_short + +; SICI: buffer_load_ushort +; SICI: buffer_store_short ; GFX9-DAG: global_load_short_d16_hi v ; GFX9-DAG: global_load_short_d16 v diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index dafabdff39a..d3e4afc8e83 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s ; DAGCombiner will transform: ; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFFFFFF)) @@ -36,16 +36,8 @@ define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) { ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]] ; CI: v_or_b32_e32 -; VI: flat_load_ushort [[HI:v[0-9]+]] -; VI: flat_load_ushort [[LO:v[0-9]+]] -; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}} -; VI-DAG: v_and_b32_e32 
[[FABS_LO:v[0-9]+]], [[HI]], [[MASK]] -; VI-DAG: v_and_b32_sdwa [[FABS_HI:v[0-9]+]], [[LO]], [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, [[FABS_LO]], [[FABS_HI]] -; VI: flat_store_dword - -; GFX9: s_load_dword [[VAL:s[0-9]+]] -; GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff +; GFX89: s_load_dword [[VAL:s[0-9]+]] +; GFX89: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) { %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) store <2 x half> %fabs, <2 x half> addrspace(1)* %out @@ -59,13 +51,12 @@ define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]] ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]] -; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}} -; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] -; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] -; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +; GFX89: s_load_dword s +; GFX89: s_load_dword s +; GFX89: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff +; GFX89: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]] +; GFX89: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]] ; GCN: {{flat|global}}_store_dwordx2 define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { @@ -147,9 +138,9 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x ; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} ; CI-DAG: v_add_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} -; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]] +; GFX89-DAG: v_mul_f16_e64 
v{{[0-9]+}}, |[[VAL]]|, 4.0 ; GFX89-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000 -; GFX89-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX89-DAG: v_add_f16_sdwa v{{[0-9]+}}, |[[VAL]]|, [[CONST2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD define amdgpu_kernel void @v_extract_fabs_fold_v2f16(<2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid @@ -167,11 +158,12 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(<2 x half> addrspace(1)* %i ; GCN-LABEL: {{^}}v_extract_fabs_no_fold_v2f16: ; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]] - -; FIXME: Extra bfe on VI -; GFX9-NOT: v_bfe_u32 -; VI: v_bfe_u32 ; GCN: v_and_b32_e32 [[AND:v[0-9]+]], 0x7fff7fff, [[VAL]] + + +; VI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 15 +; VI: flat_store_short + ; GFX9: global_store_short_d16_hi v{{\[[0-9]+:[0-9]+\]}}, [[AND]], off define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(<2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index be8d52fa958..7cc556ce168 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -222,12 +222,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1) ret void } -; FIXME: Fold modifier ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_v2f16: -; VI-DAG: v_bfe_u32 -; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, v{{[0-9]+}} -; VI: v_max_f16_sdwa [[REG0:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI: v_max_f16_e32 [[REG1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_max_f16_sdwa [[REG0:v[0-9]+]], |v{{[0-9]+}}|, |v{{[0-9]+}}| dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_max_f16_e64 [[REG1:v[0-9]+]], |v{{[0-9]+}}|, |v{{[0-9]+}}| ; VI-NOT: 0xffff ; VI: v_or_b32 @@ -245,9 +242,8 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspa } ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_v2f16: -; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}} -; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_max_f16_e32 [[REG1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], -|v{{[0-9]+}}|, -|v{{[0-9]+}}| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_max_f16_e64 [[REG1:v[0-9]+]], -|v{{[0-9]+}}|, -|v{{[0-9]+}}| ; VI: v_or_b32 ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}} @@ -265,9 +261,8 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> ad } ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16: -; VI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}} -; VI-DAG: v_max_f16_sdwa [[REG1:v[0-9]+]], [[FNEG]], [[FNEG]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_max_f16_e32 [[REG0:v[0-9]+]], [[FNEG]], [[FNEG]] +; VI-DAG: v_max_f16_sdwa [[REG1:v[0-9]+]], -v{{[0-9]+}}, -v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_max_f16_e64 [[REG0:v[0-9]+]], -v{{[0-9]+}}, -v{{[0-9]+}} ; VI-NOT: 0xffff ; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} neg_lo:[1,1] neg_hi:[1,1]{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll index 937bd74a0fe..39455acad48 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll @@ -94,12 +94,13 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(half addrspace(1)* %out, half ad ; SI-NEXT: v_max3_f32 ; SI-NEXT: v_max3_f32 -; VI: v_max_f16_e32 -; VI-NEXT: v_max_f16_e32 -; VI-NEXT: 
v_max_f16_e32 -; VI-NEXT: v_max_f16_e32 -; VI-NEXT: v_max_f16_e32 -; VI-NEXT: v_max_f16_e32 +; VI: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_max_f16_e32 v0, v0, v1 +; VI: v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI: v_max_f16_e32 v0, v2, v0 +; VI: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI: v_max_f16_e32 v0, v0, v3 +; VI: v_or_b32_e32 v0, v0, v1 ; GFX9: v_pk_max_f16 ; GFX9-NEXT: v_pk_max_f16 diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll index d22333384dc..06befaa64b5 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -92,12 +92,13 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(half addrspace(1)* %out, half ad ; SI-NEXT: v_min3_f32 ; SI-NEXT: v_min3_f32 -; VI: v_min_f16_e32 -; VI-NEXT: v_min_f16_e32 -; VI-NEXT: v_min_f16_e32 -; VI-NEXT: v_min_f16_e32 -; VI-NEXT: v_min_f16_e32 -; VI-NEXT: v_min_f16_e32 +; VI: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_min_f16_e32 v0, v0, v1 +; VI: v_min_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI: v_min_f16_e32 v0, v2, v0 +; VI: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI: v_min_f16_e32 v0, v0, v3 +; VI: v_or_b32_e32 v0, v0, v1 ; GFX9: v_pk_min_f16 ; GFX9: v_pk_min_f16 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index b43271c1bd0..a4722876d3f 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -73,12 +73,9 @@ define amdgpu_kernel void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspa ; CI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, v{{[0-9]+}} ; CI: v_or_b32_e32 [[OR:v[0-9]+]], v{{[0-9]+}}, [[SHL]] ; CI: 
v_or_b32_e32 v{{[0-9]+}}, 0x80008000, [[OR]] -; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}} -; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]] -; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; CIVI: flat_store_dword +; FIXME: Random commute +; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000 ; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}} define amdgpu_kernel void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) { %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) @@ -95,14 +92,13 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x ; CI: v_or_b32_e32 [[OR1:v[0-9]+]], v{{[0-9]+}}, [[SHL1]] ; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR0]] ; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR1]] -; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}} -; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]] -; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], - -; GFX9: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000 + +; FIXME: Random commute +; GFX89: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000 + +; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]] +; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]] + ; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}} ; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}} @@ -120,7 +116,7 @@ define amdgpu_kernel void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x h ; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} ; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -; VI: v_mul_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|, 4.0 +; VI: v_mul_f16_e64 v{{[0-9]+}}, -|s{{[0-9]+}}|, 4.0 ; VI: v_mul_f16_sdwa v{{[0-9]+}}, 
-|v{{[0-9]+}}|, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll index 5649ddfc6e3..b4f8bb98cd7 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll @@ -60,7 +60,8 @@ define amdgpu_kernel void @v_fneg_fold_f16(half addrspace(1)* %out, half addrspa ret void } -; FIXME: Terrible code with VI and even worse with SI/CI +; FIXME: Terrible code with SI/CI. +; FIXME: scalar for VI, vector for gfx9 ; GCN-LABEL: {{^}}s_fneg_v2f16: ; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}} ; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} @@ -68,12 +69,9 @@ define amdgpu_kernel void @v_fneg_fold_f16(half addrspace(1)* %out, half addrspa ; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} ; CI: v_or_b32_e32 -; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x8000{{$}} -; VI-DAG: v_xor_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-DAG: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] +; VI: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000 ; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}} - define amdgpu_kernel void @s_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 { %fneg = fsub <2 x half> <half -0.0, half -0.0>, %in store <2 x half> %fneg, <2 x half> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 0df58519a62..a042700edf8 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN 
-check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; half args should be promoted to float for SI and lower. @@ -13,13 +13,17 @@ define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { ret void } +; FIXME: Should always be the same ; GCN-LABEL: {{^}}load_v2f16_arg: -; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 -; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46 -; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]] -; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]] -; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN: s_endpgm +; SI-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 +; SI-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46 +; SI: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]] +; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]] +; SI: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} + +; VI: s_load_dword [[ARG:s[0-9]+]] +; VI: v_mov_b32_e32 [[V_ARG:v[0-9]+]], [[ARG]] +; VI: buffer_store_dword [[V_ARG]] define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { store <2 x half> %arg, <2 x half> addrspace(1)* %out ret void @@ -40,12 +44,18 @@ define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x ha } ; GCN-LABEL: {{^}}load_v4f16_arg: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_store_dwordx2 -; GCN: s_endpgm +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: 
buffer_load_ushort +; SI: buffer_store_dwordx2 + +; FIXME: Why not one load? +; VI-DAG: s_load_dword [[ARG0_LO:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; VI-DAG: s_load_dword [[ARG0_HI:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 +; VI-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], [[ARG0_LO]] +; VI-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], [[ARG0_HI]] +; VI: buffer_store_dwordx2 v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}} define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { store <4 x half> %arg, <4 x half> addrspace(1)* %out ret void @@ -104,14 +114,20 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* } ; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort + + +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s ; GCN: v_cvt_f32_f16_e32 ; GCN: v_cvt_f32_f16_e32 @@ -145,8 +161,12 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, hal } ; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg: -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v +; SI-DAG: buffer_load_ushort v +; SI-DAG: buffer_load_ushort v + +; VI-DAG: s_load_dword s +; VI: s_lshr_b32 + ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f64_f32_e32 @@ -176,10 +196,14 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* } ; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg: -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v +; SI: 
buffer_load_ushort v +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v + +; VI: s_load_dword s +; VI: s_load_dword s + ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f32_f16_e32 @@ -196,15 +220,23 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* } ; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg: -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v + +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v +; SI: buffer_load_ushort v + + +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s + -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v -; GCN-DAG: buffer_load_ushort v ; GCN-DAG: v_cvt_f32_f16_e32 ; GCN-DAG: v_cvt_f32_f16_e32 diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll index 4844060feba..3227633496a 100644 --- a/llvm/test/CodeGen/AMDGPU/immv216.ll +++ b/llvm/test/CodeGen/AMDGPU/immv216.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope 
-check-prefixes=GCN,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s ; FIXME: Merge into imm.ll ; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16: @@ -120,11 +120,14 @@ define amdgpu_kernel void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONST0:v[0-9]+]], 0 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -138,11 +141,14 @@ define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %ou ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 
[[CONST05:v[0-9]+]], 0x3800 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 0.5 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -156,11 +162,14 @@ define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %ou ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -0.5 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONSTM05:v[0-9]+]], 0xb800 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -0.5 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -174,11 +183,14 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1.0 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, 
[[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0x3c00 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1.0 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -192,11 +204,15 @@ define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %ou ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -1.0 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL0]] -; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD + +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] +; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0xbc00 +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -1.0 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -210,11 +226,14 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2.0 
op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2.0 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -228,11 +247,14 @@ define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %ou ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -2.0 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONSTM2:v[0-9]+]], 0xc000 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -2.0 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -246,11 +268,14 @@ define 
amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 4.0 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 4.0 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -264,11 +289,14 @@ define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %ou ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -4.0 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONSTM4:v[0-9]+]], 0xc400 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONSTM4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONSTM4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], -4.0 ; VI: v_or_b32 ; VI: buffer_store_dword define 
amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -321,11 +349,14 @@ define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %o ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL0]] -; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 1 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] +; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 1{{$}} +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 1{{$}} ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -339,11 +370,15 @@ define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL0]] -; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 2 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD + +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] +; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 2{{$}} +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: 
v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 2{{$}} ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -357,11 +392,15 @@ define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 16 op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL0]] -; VI-DAG: v_mov_b32_e32 [[CONST16:v[0-9]+]], 16 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD + +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] +; VI-DAG: v_mov_b32_e32 [[CONST16:v[0-9]+]], 16{{$}} +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 16{{$}} ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -375,10 +414,9 @@ define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out ; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]] ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI: v_or_b32_e32 [[REG:v[0-9]+]] -; VI: v_add_u32_e32 [[REG]], vcc, -1, [[REG]] +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], -1{{$}} +; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]] ; VI: buffer_store_dword [[REG]] define amdgpu_kernel void 
@add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %xbc = bitcast <2 x half> %x to i32 @@ -393,10 +431,9 @@ define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* % ; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]] ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI: v_or_b32_e32 [[REG:v[0-9]+]] -; VI: v_add_u32_e32 [[REG]], vcc, 0xfffefffe, [[REG]] +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfffefffe{{$}} +; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]] ; VI: buffer_store_dword [[REG]] define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %xbc = bitcast <2 x half> %x to i32 @@ -411,10 +448,10 @@ define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* % ; GFX9: v_mov_b32_e32 [[REG:v[0-9]+]], [[VAL]] ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI: v_or_b32_e32 [[REG:v[0-9]+]] -; VI: v_add_u32_e32 [[REG]], vcc, 0xfff0fff0, [[REG]] + +; VI: s_load_dword [[VAL:s[0-9]+]] +; VI: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], 0xfff0fff0{{$}} +; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]] ; VI: buffer_store_dword [[REG]] define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %xbc = bitcast <2 x half> %x to i32 @@ -429,11 +466,14 @@ define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 63 ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONST63:v[0-9]+]], 63 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], 
[[CONST63]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST63]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 63 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -447,11 +487,14 @@ define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 64 ; GFX9: buffer_store_dword [[REG]] -; VI: buffer_load_ushort [[VAL0:v[0-9]+]] -; VI: buffer_load_ushort [[VAL1:v[0-9]+]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL0]] +; FIXME: Shouldn't need right shift and SDWA, also extra copy +; VI-DAG: s_load_dword [[VAL:s[0-9]+]] ; VI-DAG: v_mov_b32_e32 [[CONST64:v[0-9]+]], 64 -; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[VAL1]], [[CONST64]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VAL]], 16 +; VI-DAG: v_mov_b32_e32 [[V_SHR:v[0-9]+]], [[SHR]] + +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[V_SHR]], [[CONST64]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_add_f16_e64 v{{[0-9]+}}, [[VAL]], 64 ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll index c1d67ba614c..ae736f53378 100644 --- a/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll @@ -42,16 +42,19 @@ define amdgpu_kernel void @s_input_output_f16() { ret void } -; GCN: error: couldn't allocate output register for constraint 's' -; GCN: error: couldn't allocate input reg for 
constraint 's' +; CI: error: couldn't allocate output register for constraint 's' +; CI: error: couldn't allocate input reg for constraint 's' + +; VI-NOT: error define amdgpu_kernel void @s_input_output_v2f16() { %v = tail call <2 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(<2 x half> %v) ret void } -; GCN: error: couldn't allocate output register for constraint 'v' -; GCN: error: couldn't allocate input reg for constraint 'v' +; CI: error: couldn't allocate output register for constraint 'v' +; CI: error: couldn't allocate input reg for constraint 'v' +; VI-NOT: error define amdgpu_kernel void @v_input_output_v2f16() { %v = tail call <2 x half> asm sideeffect "v_mov_b32 $0, -1", "=v"() tail call void asm sideeffect "; use $0", "v"(<2 x half> %v) @@ -67,8 +70,12 @@ define amdgpu_kernel void @s_input_output_i16() { ret void } -; GCN: error: couldn't allocate output register for constraint 's' -; GCN: error: couldn't allocate input reg for constraint 's' +; FIXME: Should work on all targets? 
+ +; CI: error: couldn't allocate output register for constraint 's' +; CI: error: couldn't allocate input reg for constraint 's' + +; VI-NOT: error define amdgpu_kernel void @s_input_output_v2i16() { %v = tail call <2 x i16> asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(<2 x i16> %v) diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 337cac2fc22..4f075f50258 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1,5 +1,5 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tahiti -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=GCN-NO-TONGA %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=GCN-TONGA %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tahiti -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s ; FIXME: Broken on evergreen ; FIXME: For some reason the 8 and 16 vectors are being stored as diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 8bf55a4544c..cb2753bcc08 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck 
-enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI -check-prefix=GFX89 %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s ; GCN-LABEL: {{^}}s_insertelement_v2i16_0: ; GCN: s_load_dword [[VEC:s[0-9]+]] @@ -39,11 +39,21 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* % ; GCN: s_load_dword [[ELT0:s[0-9]+]] ; GCN: s_load_dword [[VEC:s[0-9]+]] -; CIVI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}} -; CIVI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16 -; CIVI: s_lshl_b32 [[ELT1:s[0-9]+]], [[SHR]], 16 -; CIVI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]] -; CIVI-DAG: ; use [[SHR]] +; CI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}} +; CI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16 +; CI: s_lshl_b32 [[ELT1:s[0-9]+]], [[SHR]], 16 +; CI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]] +; CI-DAG: ; use [[SHR]] + + +; FIXME: Should be able to avoid mask of upper bits +; VI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}} +; VI-DAG: s_and_b32 
[[VEC_HIMASK:s[0-9]+]], [[VEC]], 0xffff0000{{$}} +; VI: s_or_b32 [[OR:s[0-9]+]], [[ELT0]], [[VEC_HIMASK]] +; VI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16 + +; VI-DAG: ; use [[SHR]] + ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16 ; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT0]], [[ELT1]] @@ -103,10 +113,16 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> a ; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1] ; GCN: s_load_dword [[VEC:s[0-9]+]], -; CIVI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16 -; CIVI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16 -; CIVI-DAG: s_lshl_b32 [[VEC_HI:s[0-9]+]], [[SHR]], 16 -; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]] +; CI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16 +; CI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16 +; CI-DAG: s_lshl_b32 [[VEC_HI:s[0-9]+]], [[SHR]], 16 +; CI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]] + + +; VI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16 +; VI-DAG: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[VEC]], 16 +; VI: s_and_b32 [[MASK_HI:s[0-9]+]], [[VEC]], 0xffff0000 +; VI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[MASK_HI]] ; GFX9-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16 ; GFX9-DAG: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[VEC]], 16 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index c307df8a444..8e9abb9de8b 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -1,8 +1,8 @@ -; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefixes=SI,GCN,MESA-GCN,FUNC -; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC -; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck %s --check-prefixes=VI,GCN,HSA-VI,FUNC -; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=EG --check-prefix=FUNC 
-; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s +; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=EG --check-prefix=FUNC %s +; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -enable-var-scope --check-prefix=EG --check-prefix=FUNC %s ; FUNC-LABEL: {{^}}i8_arg: ; HSA-VI: kernarg_segment_alignment = 4 @@ -162,10 +162,11 @@ entry: ; HSA-VI: kernarg_segment_alignment = 4 ; EG: VTX_READ_16 ; EG: VTX_READ_16 -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort + +; SI: buffer_load_ushort +; SI: buffer_load_ushort + +; VI: s_load_dword s define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) { entry: store <2 x i16> %in, <2 x i16> addrspace(1)* %out @@ -285,14 +286,14 @@ entry: ; EG: VTX_READ_16 ; EG: VTX_READ_16 ; EG: VTX_READ_16 -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; HSA-GCN: flat_load_ushort -; HSA-GCN: flat_load_ushort -; HSA-GCN: flat_load_ushort -; HSA-GCN: flat_load_ushort + +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort + +; VI: s_load_dword s +; VI: s_load_dword s define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { entry: store <4 x i16> %in, <4 x i16> addrspace(1)* %out @@ -305,6 +306,7 @@ entry: ; EG-DAG: 
T{{[0-9]\.[XYZW]}}, KC0[3].Z ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X + ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 @@ -370,22 +372,20 @@ entry: ; EG: VTX_READ_16 ; EG: VTX_READ_16 ; EG: VTX_READ_16 -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort + +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort + +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) { entry: store <8 x i16> %in, <8 x i16> addrspace(1)* %out @@ -502,38 +502,32 @@ entry: ; EG: VTX_READ_16 ; EG: VTX_READ_16 ; EG: VTX_READ_16 -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; MESA-GCN: buffer_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; 
HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort -; HSA-VI: flat_load_ushort + +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort +; SI: buffer_load_ushort + +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_load_dword s define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) { entry: store <16 x i16> %in, <16 x i16> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll index 76dd2fe6e53..2d793c0bd84 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED 
-check-prefix=GFX9 %s @@ -13,9 +13,12 @@ main_body: ; GCN-LABEL: {{^}}buffer_store_format_d16_xy: -; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: buffer_store_format_d16_xy v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; UNPACKED: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} +; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] +; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] +; UNPACKED: buffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen ; PACKED: buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %index) { @@ -26,17 +29,27 @@ main_body: ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: -; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; +; UNPACKED-DAG: s_load_dword [[S_DATA_0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; UNPACKED-DAG: s_load_dword [[S_DATA_1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x38 + +; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} +; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], [[S_DATA_0]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], [[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], [[S_DATA_1]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], [[S_DATA_1]], [[K]] + +; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] +; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] + ; UNPACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, 
v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -; GFX81: v_or_b32_e32 v[[HI:[0-9]+]] -; GFX81: v_or_b32_e32 v[[LO:[0-9]+]] -; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]] -; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]] + +; PACKED-DAG: s_load_dword [[S_DATA_0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; PACKED-DAG: s_load_dword [[S_DATA_1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x38 + +; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], [[S_DATA_0]] +; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[S_DATA_1]] ; PACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %index) { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll index acc7f14f5fa..b14430e4659 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.ll @@ -1,6 +1,6 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=UNPACKED %s -; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s -; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s ; GCN-LABEL: {{^}}image_load_f16 ; GCN: image_load v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 unorm d16 @@ -58,11 +58,17 @@ main_body: ret void } -; GCN-LABEL: 
{{^}}image_store_v2f16 +; FIXME: Eliminate and to get low bits +; GCN-LABEL: {{^}}image_store_v2f16: +; UNPACKED: s_load_dword [[DATA:s[0-9]+]] +; UNPACKED-DAG: s_lshr_b32 [[UNPACK_1:s[0-9]+]], [[DATA]], 16 +; UNPACKED-DAG: s_and_b32 [[UNPACK_0:s[0-9]+]], [[DATA]], 0xffff +; UNPACKED-DAG: v_mov_b32_e32 v[[V_UNPACK_0:[0-9]+]], [[UNPACK_0]] +; UNPACKED-DAG: v_mov_b32_e32 v[[V_UNPACK_1:[0-9]+]], [[UNPACK_1]] -; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: image_store v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16 + + +; UNPACKED: image_store v{{\[}}[[V_UNPACK_0]]:[[V_UNPACK_1]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16 ; PACKED: image_store v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16 define amdgpu_kernel void @image_store_v2f16(<2 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) { @@ -72,20 +78,19 @@ main_body: } ; GCN-LABEL: {{^}}image_store_v4f16 - -; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: image_store v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 - -; GFX81: v_or_b32_e32 v[[HI:[0-9]+]] -; GFX81: v_or_b32_e32 v[[LO:[0-9]+]] - -; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]] -; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]] - -; PACKED: image_store v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 +; UNPACKED: s_load_dword s +; UNPACKED: s_load_dword s +; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; UNPACKED: s_and_b32 +; UNPACKED: s_and_b32 +; UNPACKED: image_store 
v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 + +; PACKED: s_load_dword [[DATA0:s[0-9]+]] +; PACKED: s_load_dword [[DATA1:s[0-9]+]] +; PACKED: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[DATA0]] +; PACKED: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[DATA1]] +; PACKED: image_store v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 define amdgpu_kernel void @image_store_v4f16(<4 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) { main_body: call void @llvm.amdgcn.image.store.v4f16.v4i32.v8i32(<4 x half> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) @@ -93,20 +98,19 @@ main_body: } ; GCN-LABEL: {{^}}image_store_mip_v4f16 - -; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}]{{$}} -; UNPACKED: image_store_mip v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 - -; GFX81: v_or_b32_e32 v[[HI:[0-9]+]] -; GFX81: v_or_b32_e32 v[[LO:[0-9]+]] - -; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]] -; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]] - -; PACKED: image_store_mip v{{\[}}[[LO]]:[[HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 +; UNPACKED: s_load_dword s +; UNPACKED: s_load_dword s +; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; UNPACKED: s_and_b32 +; UNPACKED: s_and_b32 +; UNPACKED: image_store_mip v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 + +; PACKED: s_load_dword [[DATA0:s[0-9]+]] +; PACKED: s_load_dword [[DATA1:s[0-9]+]] +; PACKED: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[DATA0]] +; PACKED: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[DATA1]] +; PACKED: image_store_mip v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, 
v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16 define amdgpu_kernel void @image_store_mip_v4f16(<4 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) { main_body: call void @llvm.amdgcn.image.store.mip.v4f16.v4i32.v8i32(<4 x half> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll index 8234e2c3993..671a5a6f05a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -1,4 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}load_1d: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll index bbe21ea34a7..517c0a90650 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s @@ -12,12 +12,13 @@ main_body: ret void } - ; GCN-LABEL: {{^}}tbuffer_store_d16_xy: - -; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], 
v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +; GCN: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} +; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] +; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] +; UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen ; PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) { @@ -26,21 +27,23 @@ main_body: ret void } - ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: +; GCN-DAG: s_load_dword [[S_DATA_0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34 +; GCN-DAG: s_load_dword [[S_DATA_1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x38 -; UNPACKED: flat_load_ushort v[[HI:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: flat_load_ushort v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: flat_load_ushort v[[LO:[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; -; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} +; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], [[S_DATA_0]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], [[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], [[S_DATA_1]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], [[S_DATA_1]], [[K]] -; GFX81: v_or_b32_e32 v[[HI:[0-9]+]] -; GFX81: v_or_b32_e32 v[[LO:[0-9]+]] +; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] +; UNPACKED-DAG: v_mov_b32_e32 
v[[HI:[0-9]+]], [[SHR1]] +; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen -; GFX9: v_mov_b32_e32 v[[LO:[0-9]+]] -; GFX9: v_mov_b32_e32 v[[HI:[0-9]+]] +; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], [[S_DATA_0]] +; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[S_DATA_1]] ; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll index bfcce66ac1d..f59741426ba 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -145,8 +145,12 @@ define amdgpu_kernel void @fma_v2f16( } ; GCN-LABEL: {{^}}fma_v2f16_imm_a: -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]] + +; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]] +; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]] + ; SI: v_mov_b32_e32 v[[A_F32:[0-9]+]], 0x40400000{{$}} ; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}} @@ -185,8 +189,8 @@ define amdgpu_kernel void @fma_v2f16_imm_a( ; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]] -; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]] ; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]] ; SI: v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}} ; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}} @@ -228,8 +232,8 @@ define amdgpu_kernel void @fma_v2f16_imm_b( ; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; SI: v_mov_b32_e32 
v[[C_F32:[0-9]+]], 0x40400000{{$}} ; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll index 931303a7c9a..350ecedb80d 100644 --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s ; GCN-LABEL: {{^}}s_lshr_v2i16: ; GFX9: s_load_dword [[LHS:s[0-9]+]] @@ -8,11 +8,20 @@ ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]] ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]] -; VI-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD + +; VI: s_load_dword [[LHS:s[0-9]+]] +; VI: s_load_dword [[RHS:s[0-9]+]] +; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; VI-DAG: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16 +; VI-DAG: s_lshl_b32 +; VI: v_or_b32_e32 + ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} -; CIVI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16 -; CIVI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} 
+; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; CI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16 +; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = lshr <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index 0c127613580..b692b2226c6 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -117,8 +117,10 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 ; SI: v_min_i32 ; SI: v_min_i32 -; VI: v_min_i32 -; VI: v_min_i32 +; VI: s_sext_i32_i16 +; VI: s_sext_i32_i16 +; VI: s_min_i32 +; VI: s_min_i32 ; GFX9: v_pk_min_i16 @@ -131,17 +133,16 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, < ret void } -; FIXME: VI use s_min_i32 ; FUNC-LABEL: {{^}}s_test_imin_sle_v4i16: ; SI: v_min_i32 ; SI: v_min_i32 ; SI: v_min_i32 ; SI: v_min_i32 -; VI: v_min_i32 -; VI: v_min_i32 -; VI: v_min_i32 -; VI: v_min_i32 +; VI: s_min_i32 +; VI: s_min_i32 +; VI: s_min_i32 +; VI: s_min_i32 ; GFX9: v_pk_min_i16 ; GFX9: v_pk_min_i16 @@ -461,14 +462,14 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, < ; SI: v_min_u32 ; SI: v_min_u32 -; VI: v_min_u32 -; VI: v_min_u32 -; VI: v_min_u32 -; VI: v_min_u32 -; VI: v_min_u32 -; VI: v_min_u32 -; VI: v_min_u32 -; VI: v_min_u32 +; VI: s_min_u32 +; VI: s_min_u32 +; VI: s_min_u32 +; VI: s_min_u32 +; VI: s_min_u32 +; VI: s_min_u32 +; VI: s_min_u32 +; VI: s_min_u32 ; EG: MIN_UINT ; EG: MIN_UINT diff --git a/llvm/test/CodeGen/AMDGPU/reduction.ll b/llvm/test/CodeGen/AMDGPU/reduction.ll index 445526ec89d..621d83b731e 100644 --- a/llvm/test/CodeGen/AMDGPU/reduction.ll +++ b/llvm/test/CodeGen/AMDGPU/reduction.ll @@ -5,7 +5,7 @@ ; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; 
GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_add_f16_e32 +; VI: v_add_f16_sdwa ; VI-NEXT: v_add_f16_e32 ; VI-NEXT: v_add_f16_e32 define half @reduction_half4(<4 x half> %vec4) { @@ -22,7 +22,7 @@ entry: ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_add_u16_e32 +; VI: v_add_u16_sdwa ; VI-NEXT: v_add_u16_e32 ; VI-NEXT: v_add_u16_e32 define i16 @reduction_v4i16(<4 x i16> %vec4) { @@ -41,8 +41,8 @@ entry: ; GFX9-NEXT: v_pk_add_f16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}} ; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_add_f16_e32 -; VI-NEXT: v_add_f16_e32 +; VI: v_add_f16_sdwa +; VI-NEXT: v_add_f16_sdwa ; VI-NEXT: v_add_f16_e32 ; VI-NEXT: v_add_f16_e32 ; VI-NEXT: v_add_f16_e32 @@ -67,8 +67,8 @@ entry: ; GFX9-NEXT: v_pk_add_u16 [[ADD3]], [[ADD2]], [[ADD1]]{{$}} ; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_add_u16_e32 -; VI-NEXT: v_add_u16_e32 +; VI: v_add_u16_sdwa +; VI-NEXT: v_add_u16_sdwa ; VI-NEXT: v_add_u16_e32 ; VI-NEXT: v_add_u16_e32 ; VI-NEXT: v_add_u16_e32 @@ -97,10 +97,10 @@ entry: ; GFX9-NEXT: v_pk_add_f16 [[ADD3]], [[ADD2]], [[ADD1]]{{$}} ; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_add_f16_e32 -; VI-NEXT: v_add_f16_e32 -; VI-NEXT: v_add_f16_e32 -; VI-NEXT: v_add_f16_e32 +; VI: v_add_f16_sdwa +; VI-NEXT: v_add_f16_sdwa +; VI-NEXT: v_add_f16_sdwa +; VI-NEXT: v_add_f16_sdwa ; VI-NEXT: v_add_f16_e32 ; VI-NEXT: v_add_f16_e32 ; VI-NEXT: v_add_f16_e32 @@ -131,7 +131,7 @@ entry: ; GFX9: v_pk_min_u16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: 
v_min_u16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_min_u16_e32 +; VI: v_min_u16_sdwa ; VI-NEXT: v_min_u16_e32 ; VI-NEXT: v_min_u16_e32 define i16 @reduction_min_v4i16(<4 x i16> %vec4) { @@ -152,8 +152,8 @@ entry: ; GFX9-NEXT: v_pk_min_u16 [[MIN3:v[0-9]+]], [[MIN2]], [[MIN1]]{{$}} ; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, [[MIN3]], [[MIN3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_min_u16_e32 -; VI-NEXT: v_min_u16_e32 +; VI: v_min_u16_sdwa +; VI-NEXT: v_min_u16_sdwa ; VI-NEXT: v_min_u16_e32 ; VI-NEXT: v_min_u16_e32 ; VI-NEXT: v_min_u16_e32 @@ -224,10 +224,10 @@ entry: ; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_min_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_min_i16_e32 -; VI-NEXT: v_min_i16_e32 -; VI-NEXT: v_min_i16_e32 -; VI-NEXT: v_min_i16_e32 +; VI: v_min_i16_sdwa +; VI-NEXT: v_min_i16_sdwa +; VI-NEXT: v_min_i16_sdwa +; VI-NEXT: v_min_i16_sdwa ; VI-NEXT: v_min_i16_e32 ; VI-NEXT: v_min_i16_e32 ; VI-NEXT: v_min_i16_e32 @@ -339,7 +339,7 @@ entry: ; GFX9: v_pk_max_u16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_max_u16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_max_u16_e32 +; VI: v_max_u16_sdwa ; VI-NEXT: v_max_u16_e32 ; VI-NEXT: v_max_u16_e32 define i16 @reduction_umax_v4i16(<4 x i16> %vec4) { @@ -358,7 +358,7 @@ entry: ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_max_i16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_max_i16_e32 +; VI: v_max_i16_sdwa ; VI-NEXT: v_max_i16_e32 ; VI-NEXT: v_max_i16_e32 define i16 @reduction_smax_v4i16(<4 x i16> %vec4) #0 { @@ -377,7 +377,7 @@ entry: ; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: 
v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_max_f16_e32 +; VI: v_max_f16_sdwa ; VI-NEXT: v_max_f16_e32 ; VI-NEXT: v_max_f16_e32 define half @reduction_fmax_v4half(<4 x half> %vec4) { @@ -396,7 +396,7 @@ entry: ; GFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI: v_min_f16_e32 +; VI: v_min_f16_sdwa ; VI-NEXT: v_min_f16_e32 ; VI-NEXT: v_min_f16_e32 define half @reduction_fmin_v4half(<4 x half> %vec4) { @@ -409,4 +409,4 @@ entry: %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1 %res = extractelement <4 x half> %rdx.minmax.select3, i32 0 ret half %res -}
\ No newline at end of file +} diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index 47e6545d0a0..3c92e8e5cba 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=NOSDWA,GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,SDWA,GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,SDWA,GCN %s ; GCN-LABEL: {{^}}add_shr_i32: ; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}} diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll index e5c8191b24b..1e34036e6bd 100644 --- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll +++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs 
-march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s ; Test expansion of scalar selects on vectors. ; Evergreen not enabled since it seems to be having problems with doubles. 
@@ -76,8 +76,14 @@ define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, } ; GCN-LABEL: {{^}}select_v2i16: -; GCN: v_cndmask_b32_e32 -; GCN-NOT: v_cndmask_b32 +; GFX89: s_load_dword +; GFX89: s_load_dword +; GFX89: s_load_dword +; GFX89: v_cndmask_b32 +; GFX89-NOT: v_cndmask_b32 + +; SI: v_cndmask_b32_e32 +; SI-NOT: v_cndmask_b32 define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b @@ -86,7 +92,9 @@ define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> } ; GCN-LABEL: {{^}}v_select_v2i16: -; GCN: v_cndmask_b32_e32 +; GCN: buffer_load_dword v +; GCN: buffer_load_dword v +; GCN: v_cndmask_b32 ; GCN-NOT: cndmask define amdgpu_kernel void @v_select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr, i32 %c) #0 { %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.ptr @@ -330,7 +338,7 @@ define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x do } ; GCN-LABEL: {{^}}v_select_v2f16: -; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32 ; GCN-NOT: cndmask define amdgpu_kernel void @v_select_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %a.ptr, <2 x half> addrspace(1)* %b.ptr, i32 %c) #0 { %a = load <2 x half>, <2 x half> addrspace(1)* %a.ptr diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll index c3e71e27d2b..260aac8d159 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s -; RUN: llc -march=amdgcn -mcpu=bonaire 
-verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s ; GCN-LABEL: {{^}}s_shl_v2i16: ; GFX9: s_load_dword [[LHS:s[0-9]+]] @@ -8,9 +8,14 @@ ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]] ; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]] -; VI: v_lshlrev_b32_e32 -; VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI: s_load_dword s +; VI: s_load_dword s +; VI: s_lshr_b32 +; VI: s_lshr_b32 +; VI: s_and_b32 +; VI: s_and_b32 +; SI: s_and_b32 +; SI: s_or_b32 ; CI-DAG: v_lshlrev_b32_e32 ; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll index c80945f390b..429493c85fb 100644 --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga 
-mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; GCN-LABEL: {{^}}s_sext_i1_to_i32: ; GCN: v_cndmask_b32_e64 @@ -177,10 +177,15 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addr ret void } -; FIXME: s_bfe_i64 +; FIXME: s_bfe_i64, same on SI and VI ; GCN-LABEL: {{^}}s_sext_v4i16_to_v4i32: -; GCN-DAG: s_ashr_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 48 -; GCN-DAG: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; SI-DAG: s_ashr_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 48 +; SI-DAG: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16 + +; VI: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; VI: s_ashr_i32 s{{[0-9]+}}, s{{[0-9]+}}, 16 + + ; GCN-DAG: s_sext_i32_i16 ; GCN-DAG: s_sext_i32_i16 ; GCN: s_endpgm @@ -199,8 +204,6 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) } ; GCN-LABEL: {{^}}v_sext_v4i16_to_v4i32: -; SI-DAG: v_ashr_i64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, 48 -; VI-DAG: v_ashrrev_i64 v{{\[[0-9]+:[0-9]+\]}}, 48, v{{\[[0-9]+:[0-9]+\]}} ; GCN-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; GCN-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index 44f3cb19dc9..eb02084d8eb 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | 
FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,CIVI,GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI,CIVI,GCN %s ; GCN-LABEL: {{^}}s_abs_v2i16: ; GFX9: s_load_dword [[VAL:s[0-9]+]] @@ -8,13 +8,15 @@ ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 -; VI: v_sub_u32_e32 -; VI-DAG: v_sub_u32_e32 -; VI: v_max_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; VI: v_max_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; VI: v_add_u32_e32 -; VI: v_add_u32_e32 -; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 +; VI: s_sub_i32 +; VI: s_sub_i32 +; VI: s_max_i32 +; VI: s_max_i32 +; SI: s_add_i32 +; SI: s_add_i32 +; SI: s_and_b32 +; SI: s_or_b32 ; CI: v_sub_i32_e32 ; CI-DAG: v_sub_i32_e32 diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index a608ef715c5..cee8d3eb615 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -1,12 +1,15 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck 
-check-prefix=VI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX89,GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GFX89,GCN %s ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_sub_v2i16: +; GFX89: {{flat|global}}_load_dword +; GFX89: {{flat|global}}_load_dword + ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -47,10 +50,15 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, < ; FIXME: VI should not scalarize arg access. 
; GCN-LABEL: {{^}}s_test_sub_v2i16_kernarg: +; GCN: s_load_dword s +; GCN: s_load_dword s + ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; VI: v_subrev_u32_e32 -; VI: v_subrev_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI: s_sub_i32 +; VI: s_sub_i32 +; VI: s_lshl_b32 +; VI: s_and_b32 define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 { %add = sub <2 x i16> %a, %b store <2 x i16> %add, <2 x i16> addrspace(1)* %out @@ -58,12 +66,15 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out } ; GCN-LABEL: {{^}}v_test_sub_v2i16_constant: -; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}} +; GFX89-DAG: {{flat|global}}_load_dword + +; GFX9-DAG: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}} ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]] ; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xfffffe38 -; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xffffff85, v{{[0-9]+}} +; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI: v_or_b32 define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -95,11 +106,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* ; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_neg1: ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, -1 op_sel_hi:[1,0]{{$}} -; VI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 -; VI: flat_load_ushort [[LOAD0:v[0-9]+]] -; VI: flat_load_ushort [[LOAD1:v[0-9]+]] -; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD0]], [[ONE]] dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD -; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD1]] +; VI-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; VI-DAG: flat_load_dword [[LOAD:v[0-9]+]] +; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD]], [[ONE]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD]] ; VI: v_or_b32_e32 define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -114,11 +124,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* ; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_lo_zero_hi: ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, 32{{$}} -; VI-NOT: v_subrev_i16 -; VI: v_add_u16_e32 v{{[0-9]+}}, 0xffffffe0, v{{[0-9]+}} -; VI-NOT: v_subrev_i16 -; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, -; VI: v_or_b32_e32 +; VI: flat_load_dword [[LOAD:v[0-9]+]] +; VI-DAG: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, [[LOAD]] +; VI-DAG: v_add_u16_e32 [[ADD:v[0-9]+]], 0xffffffe0, [[LOAD]] +; VI: v_or_b32_e32 v{{[0-9]+}}, [[ADD]], [[AND]] define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -136,9 +145,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspac ; VI-NOT: v_subrev_i16 ; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0xffffc080 -; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI: flat_load_dword +; VI: v_add_u16_sdwa [[ADD:v[0-9]+]], v{{[0-9]+}}, [[K]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NOT: v_subrev_i16 -; VI: v_or_b32_e32 +; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD define 
amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -159,19 +169,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace( ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}} -; VI: flat_load_ushort v[[A_LO:[0-9]+]] -; VI: flat_load_ushort v[[A_HI:[0-9]+]] +; VI: flat_load_dword v[[A:[0-9]+]] +; VI: flat_load_dword v[[B:[0-9]+]] -; VI: flat_load_ushort v[[B_LO:[0-9]+]] -; VI: flat_load_ushort v[[B_HI:[0-9]+]] - -; VI: v_sub_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]] -; VI-NOT: and -; VI-NOT: shl -; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], v[[A_LO]], v[[B_LO]] -; VI-NOT: and -; VI-NOT: shl -; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}} +; VI: v_sub_u16_e32 v[[ADD_LO:[0-9]+]], v[[A]], v[[B]] +; VI-NEXT: v_sub_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}} define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid @@ -196,14 +199,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1) ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] ; GFX9: buffer_store_dwordx4 -; VI: flat_load_ushort v[[A_LO:[0-9]+]] -; VI: flat_load_ushort v[[A_HI:[0-9]+]] -; VI: flat_load_ushort v[[B_LO:[0-9]+]] -; VI: flat_load_ushort v[[B_HI:[0-9]+]] - -; VI: v_sub_u16_e32 -; VI: v_sub_u16_e32 - +; VI: flat_load_dword [[A:v[0-9]+]] +; VI: flat_load_dword [[B:v[0-9]+]] +; VI: v_sub_u16_e32 
v[[ADD_LO:[0-9]+]], [[A]], [[B]] +; VI: v_sub_u16_sdwa v[[ADD_HI:[0-9]+]], [[A]], [[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI: buffer_store_dwordx4 define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -228,8 +227,11 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1) ; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}} -; VI: v_sub_u16_e32 -; VI: v_sub_u16_e32 +; VI: flat_load_dword +; VI: flat_load_dword +; VI-DAG: v_sub_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + ; VI: buffer_store_dwordx2 define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll index 53e306270ac..14e2cde0e29 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll @@ -1,18 +1,15 @@ -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9,GFX89 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI,GFX89 %s ; FIXME: Should still like to vectorize the memory operations for VI ; Simple 3-pair 
chain with loads and stores ; GCN-LABEL: @test1_as_3_3_3_v2f16( -; GFX9: load <2 x half>, <2 x half> addrspace(3)* -; GFX9: load <2 x half>, <2 x half> addrspace(3)* -; GFX9: fmul <2 x half> -; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* % -; GFX9: ret - -; VI: load half -; VI: load half +; GFX89: load <2 x half>, <2 x half> addrspace(3)* +; GFX89: load <2 x half>, <2 x half> addrspace(3)* +; GFX89: fmul <2 x half> +; GFX89: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* % +; GFX89: ret define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c) { %i0 = load half, half addrspace(3)* %a, align 2 %i1 = load half, half addrspace(3)* %b, align 2 @@ -29,14 +26,11 @@ define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addr } ; GCN-LABEL: @test1_as_3_0_0( -; GFX9: load <2 x half>, <2 x half> addrspace(3)* -; GFX9: load <2 x half>, <2 x half>* -; GFX9: fmul <2 x half> -; GFX9: store <2 x half> %{{.*}}, <2 x half>* % -; GFX9: ret - -; VI: load half -; VI: load half +; GFX89: load <2 x half>, <2 x half> addrspace(3)* +; GFX89: load <2 x half>, <2 x half>* +; GFX89: fmul <2 x half> +; GFX89: store <2 x half> %{{.*}}, <2 x half>* % +; GFX89: ret define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half* %c) { %i0 = load half, half addrspace(3)* %a, align 2 %i1 = load half, half* %b, align 2 @@ -53,14 +47,11 @@ define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half* } ; GCN-LABEL: @test1_as_0_0_3_v2f16( -; GFX9: load <2 x half>, <2 x half>* -; GFX9: load <2 x half>, <2 x half>* -; GFX9: fmul <2 x half> -; GFX9: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* % -; GFX9: ret - -; VI: load half -; VI: load half +; GFX89: load <2 x half>, <2 x half>* +; GFX89: load <2 x half>, <2 x half>* +; GFX89: fmul <2 x half> +; GFX89: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* % +; GFX89: ret define amdgpu_kernel void 
@test1_as_0_0_3_v2f16(half* %a, half* %b, half addrspace(3)* %c) { %i0 = load half, half* %a, align 2 %i1 = load half, half* %b, align 2 |