summaryrefslogtreecommitdiffstats
path: root/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
diff options
context:
space:
mode:
authorSimon Pilgrim <llvm-dev@redking.me.uk>2019-07-27 14:32:23 +0000
committerSimon Pilgrim <llvm-dev@redking.me.uk>2019-07-27 14:32:23 +0000
commit062cd8bb1de9554fc3688d4028b9ef2e09ca7696 (patch)
tree185f349478f8ce8fc8be65b6a9b00f97c65ceb13 /llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
parent603f94aa2a1b87b3e1c5722a3ba270147111e70d (diff)
downloadbcm5719-llvm-062cd8bb1de9554fc3688d4028b9ef2e09ca7696.tar.gz
bcm5719-llvm-062cd8bb1de9554fc3688d4028b9ef2e09ca7696.zip
[AMDGPU] Regenerate tests.
To help show the diffs from an upcoming SimplifyDemandedBits patch. llvm-svn: 367175
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll201
1 files changed, 174 insertions, 27 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
index ff634c609e8..73d98ab5b4c 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -1,15 +1,50 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,SI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,VI
; XXX - Why the packing?
-; GCN-LABEL: {{^}}scalar_to_vector_v2i32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
-; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 16, [[VAL]]
-; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[SHR]]
-; GCN: v_or_b32_e32 v[[OR:[0-9]+]], [[SHR]], [[SHL]]
-; GCN: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[OR]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[OR]]:[[COPY]]{{\]}}
define amdgpu_kernel void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+; SI-LABEL: scalar_to_vector_v2i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s2
+; SI-NEXT: s_mov_b32 s9, s3
+; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v1, v0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: scalar_to_vector_v2i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s4, s6
+; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: v_mov_b32_e32 v1, v0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
%tmp1 = load i32, i32 addrspace(1)* %in, align 4
%bc = bitcast i32 %tmp1 to <2 x i16>
%tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -17,11 +52,48 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out,
ret void
}
-; GCN-LABEL: {{^}}scalar_to_vector_v2f32:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
-; GCN: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
-; GCN: buffer_store_dwordx2
define amdgpu_kernel void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+; SI-LABEL: scalar_to_vector_v2f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s2
+; SI-NEXT: s_mov_b32 s9, s3
+; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v1, v0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: scalar_to_vector_v2f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s4, s6
+; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: v_mov_b32_e32 v1, v0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
%tmp1 = load float, float addrspace(1)* %in, align 4
%bc = bitcast float %tmp1 to <2 x i16>
%tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -29,14 +101,40 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out,
ret void
}
-; GCN-LABEL: {{^}}scalar_to_vector_v4i16:
-; VI: v_lshlrev_b16_e32
-; VI: v_lshlrev_b16_e32
-; VI: v_or_b32_e32
-; VI: v_lshlrev_b32
-; VI: v_or_b32_sdwa
-; VI: v_or_b32_sdwa
define amdgpu_kernel void @scalar_to_vector_v4i16() {
+; SI-LABEL: scalar_to_vector_v4i16:
+; SI: ; %bb.0: ; %bb
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_or_b32_e32 v0, v0, v2
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: scalar_to_vector_v4i16:
+; VI: ; %bb.0: ; %bb
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v1
+; VI-NEXT: v_or_b32_e32 v1, v1, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
bb:
%tmp = load <2 x i8>, <2 x i8> addrspace(1)* undef, align 1
%tmp1 = shufflevector <2 x i8> %tmp, <2 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -45,14 +143,40 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}scalar_to_vector_v4f16:
-; VI: v_lshlrev_b16_e32
-; VI: v_lshlrev_b16_e32
-; VI: v_or_b32_e32
-; VI: v_lshlrev_b32
-; VI: v_or_b32_sdwa
-; VI: v_or_b32_sdwa
define amdgpu_kernel void @scalar_to_vector_v4f16() {
+; SI-LABEL: scalar_to_vector_v4f16:
+; SI: ; %bb.0: ; %bb
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_or_b32_e32 v0, v0, v2
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: scalar_to_vector_v4f16:
+; VI: ; %bb.0: ; %bb
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v1
+; VI-NEXT: v_or_b32_e32 v1, v1, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
bb:
%load = load half, half addrspace(1)* undef, align 1
%tmp = bitcast half %load to <2 x i8>
@@ -101,6 +225,29 @@ bb:
; }
define amdgpu_kernel void @scalar_to_vector_test6(<2 x half> addrspace(1)* %out, i8 zeroext %val) nounwind {
+; SI-LABEL: scalar_to_vector_test6:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_and_b32 s0, s2, 0xff
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: scalar_to_vector_test6:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_and_b32 s0, s0, 0xff
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
%newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0
%bc = bitcast <4 x i8> %newvec0 to <2 x half>
store <2 x half> %bc, <2 x half> addrspace(1)* %out
OpenPOWER on IntegriCloud