diff options
author | Tim Renouf <tpr.llvm@botech.co.uk> | 2018-08-21 11:06:05 +0000 |
---|---|---|
committer | Tim Renouf <tpr.llvm@botech.co.uk> | 2018-08-21 11:06:05 +0000 |
commit | 35484c9d5024f2273707e2a0b4fb7d1767c69dda (patch) | |
tree | 2f2ff8b74ded88f4586c6bc3c6479f3d34584c21 /llvm/test | |
parent | d378a39603566404cf098329b368a6fe390016a9 (diff) | |
download | bcm5719-llvm-35484c9d5024f2273707e2a0b4fb7d1767c69dda.tar.gz bcm5719-llvm-35484c9d5024f2273707e2a0b4fb7d1767c69dda.zip |
[AMDGPU] New tbuffer intrinsics
Summary:
This commit adds new intrinsics
llvm.amdgcn.raw.tbuffer.load
llvm.amdgcn.struct.tbuffer.load
llvm.amdgcn.raw.tbuffer.store
llvm.amdgcn.struct.tbuffer.store
with the following changes from the llvm.amdgcn.tbuffer.* intrinsics:
* there are separate raw and struct versions: raw does not have an index
arg and sets idxen=0 in the instruction, and struct always sets
idxen=1 in the instruction even if the index is 0, to allow for the
fact that gfx9 does bounds checking differently depending on whether
idxen is set;
* there is a combined format arg (dfmt+nfmt)
* there is a combined cachepolicy arg (glc+slc)
* there are now only two offset args: one for the offset that is
included in bounds checking and swizzling, to be split between the
instruction's voffset and immoffset fields, and one for the offset
that is excluded from bounds checking and swizzling, to go into the
instruction's soffset field.
The AMDISD::TBUFFER_* SD nodes always have an index operand, all three
offset operands, combined format operand, combined cachepolicy operand,
and an extra idxen operand.
The tbuffer pseudo- and real instructions now also have a combined
format operand.
The obsolescent llvm.amdgcn.tbuffer.* and llvm.SI.tbuffer.store
intrinsics continue to work.
V2: Separate raw and struct intrinsics.
V3: Moved extract_glc and extract_slc defs to a more sensible place.
V4: Rebased on D49995.
V5: Only two separate offset args instead of three.
V6: Pseudo- and real instructions have joint format operand.
V7: Restored optionality of dfmt and nfmt in assembler.
V8: Addressed minor review comments.
Subscribers: arsenm, kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, t-tye, llvm-commits
Differential Revision: https://reviews.llvm.org/D49026
Change-Id: If22ad77e349fac3a5d2f72dda53c010377d470d4
llvm-svn: 340268
Diffstat (limited to 'llvm/test')
10 files changed, 613 insertions, 2 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll new file mode 100644 index 00000000000..2c81c537093 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll @@ -0,0 +1,42 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s + +; GCN-LABEL: {{^}}tbuffer_load_d16_x: +; GCN: tbuffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) { +main_body: + %data = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 22, i32 0) + ret half %data +} + +; GCN-LABEL: {{^}}tbuffer_load_d16_xy: +; UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] +define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) { +main_body: + %data = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 22, i32 0) + %elt = extractelement <2 x half> %data, i32 1 + ret half %elt +} + +; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw: +; UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] +define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) { +main_body: + %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 22, i32 0) + %elt = extractelement <4 x half> %data, i32 3 + ret half %elt +} + +declare half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32) +declare <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32) +declare <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32) + diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll new file mode 100644 index 00000000000..2ee3adbc0de --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll @@ -0,0 +1,92 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}tbuffer_load: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:3, 0 glc +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 slc +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 +; GCN: s_waitcnt +define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 78, i32 0) + %vdata_glc = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 63, i32 1) + %vdata_slc = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 22, i32 2) + %vdata_f32 = call <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 22, i32 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float> + %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float> + %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0 + %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1 + %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2 + %r3 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r2, <4 x float> %vdata_f32, 3 + ret {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r3 +} + +; GCN-LABEL: {{^}}tbuffer_load_immoffs: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42 +define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 42, i32 0, i32 78, i32 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +; GCN-LABEL: {{^}}tbuffer_load_immoffs_large +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:2, 61 offset:4095 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:3, {{s[0-9]+}} offset:73 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, {{s[0-9]+}} offset:1 +; GCN: s_waitcnt +define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) { + %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 4095, i32 61, i32 47, i32 0) + %vdata_glc = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 73, i32 %soffs, i32 62, i32 0) + %vdata_slc = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 1, i32 %soffs, i32 77, i32 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float> + %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float> + %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0 + %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1 + %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2 + ret {<4 x float>, <4 x float>, <4 x float>} %r2 +} + +; GCN-LABEL: {{^}}tbuffer_load_ofs: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen +define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 %voffs, i32 0, i32 78, i32 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +; GCN-LABEL: {{^}}tbuffer_load_ofs_imm: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:52 +define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) { +main_body: + %ofs = add i32 %voffs, 52 + %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 %ofs, i32 0, i32 78, i32 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +; GCN-LABEL: {{^}}buffer_load_xy: +; GCN: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 +define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { + %vdata = call <2 x i32> @llvm.amdgcn.raw.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 77, i32 0) + %vdata.f = bitcast <2 x i32> %vdata to <2 x float> + ret <2 x float> %vdata.f +} + +; GCN-LABEL: {{^}}buffer_load_x: +; GCN: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 +define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) { + %vdata = call i32 @llvm.amdgcn.raw.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 77, i32 0) + %vdata.f = bitcast i32 %vdata to float + ret float %vdata.f +} + +declare i32 @llvm.amdgcn.raw.tbuffer.load.i32(<4 x i32>, i32, i32, i32, i32) +declare <2 x i32> @llvm.amdgcn.raw.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32) +declare <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) + diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll new file mode 100644 index 00000000000..4dbfe94d012 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll @@ -0,0 +1,57 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s + + +; GCN-LABEL: {{^}}tbuffer_store_d16_x: +; GCN: s_load_dwordx4 +; GCN: s_load_dword s[[S_LO:[0-9]+]] +; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]] +; GCN: tbuffer_store_format_d16_x v[[V_LO]], off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 +define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) { +main_body: + call void @llvm.amdgcn.raw.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) + ret void +} + +; GCN-LABEL: {{^}}tbuffer_store_d16_xy: +; GCN: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} +; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] +; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] +; UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 + +; PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 +define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data) { +main_body: + call void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) + ret void +} + +; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: +; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 + +; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} +; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] + +; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] +; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] +; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 + + +; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] +; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] +; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 +define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data) { +main_body: + call void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32) +declare void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32) +declare void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll new file mode 100644 index 00000000000..51f047d9b64 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll @@ -0,0 +1,75 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GCN,VERDE %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}tbuffer_store: +; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:12, nfmt:2, 0 +; GCN: tbuffer_store_format_xyzw v[4:7], off, s[0:3], dfmt:13, nfmt:3, 0 glc +; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0 slc +; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0 +define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +main_body: + %in1 = bitcast <4 x float> %1 to <4 x i32> + %in2 = bitcast <4 x float> %2 to <4 x i32> + %in3 = bitcast <4 x float> %3 to <4 x i32> + call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 44, i32 0) + call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in2, <4 x i32> %0, i32 0, i32 0, i32 61, i32 1) + call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in3, <4 x i32> %0, i32 0, i32 0, i32 78, i32 2) + call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 78, i32 0) + ret void +} + +; GCN-LABEL: {{^}}tbuffer_store_immoffs: +; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, 0 offset:42 +define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +main_body: + %in1 = bitcast <4 x float> %1 to <4 x i32> + call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 42, i32 0, i32 117, i32 0) + ret void +} + +; GCN-LABEL: {{^}}tbuffer_store_scalar_and_imm_offs: +; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, {{s[0-9]+}} offset:42 +define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) { +main_body: + %in1 = bitcast <4 x float> %vdata to <4 x i32> + call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 42, i32 %soffset, i32 117, i32 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_ofs: +; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:3, nfmt:7, 0 offen +define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) { +main_body: + %in1 = bitcast <4 x float> %vdata to <4 x i32> + call void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %voffset, i32 0, i32 115, i32 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_x1: +; GCN: tbuffer_store_format_x v0, off, s[0:3], dfmt:13, nfmt:7, 0 +define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data) { +main_body: + %data.i = bitcast float %data to i32 + call void @llvm.amdgcn.raw.tbuffer.store.i32(i32 %data.i, <4 x i32> %rsrc, i32 0, i32 0, i32 125, i32 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_x2: +; GCN: tbuffer_store_format_xy v[0:1], off, s[0:3], dfmt:1, nfmt:2, 0 +define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data) { +main_body: + %data.i = bitcast <2 x float> %data to <2 x i32> + call void @llvm.amdgcn.raw.tbuffer.store.v2i32(<2 x i32> %data.i, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i1, i1) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } + + diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll new file mode 100644 index 00000000000..80d4af26e1d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll @@ -0,0 +1,45 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s + +; GCN-LABEL: {{^}}tbuffer_load_d16_x: +; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 +; GCN: tbuffer_load_format_d16_x v{{[0-9]+}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen +define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) { +main_body: + %data = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 22, i32 0) + ret half %data +} + +; GCN-LABEL: {{^}}tbuffer_load_d16_xy: +; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 +; UNPACKED: tbuffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] +define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) { +main_body: + %data = call <2 x half> @llvm.amdgcn.struct.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 22, i32 0) + %elt = extractelement <2 x half> %data, i32 1 + ret half %elt +} + +; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw: +; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 +; UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], dfmt:6, nfmt:1, 0 idxen +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] +define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) { +main_body: + %data = call <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 22, i32 0) + %elt = extractelement <4 x half> %data, i32 3 + ret half %elt +} + +declare half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32, i32) +declare <2 x half> @llvm.amdgcn.struct.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32, i32) +declare <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32, i32) + diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll new file mode 100644 index 00000000000..5b542bbcfc5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll @@ -0,0 +1,114 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}tbuffer_load: +; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:3, 0 idxen glc +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 idxen slc +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 idxen +; GCN: s_waitcnt +define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 78, i32 0) + %vdata_glc = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 63, i32 1) + %vdata_slc = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 22, i32 2) + %vdata_f32 = call <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 22, i32 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float> + %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float> + %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0 + %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1 + %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2 + %r3 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r2, <4 x float> %vdata_f32, 3 + ret {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r3 +} + +; GCN-LABEL: {{^}}tbuffer_load_immoffs: +; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offset:42 +define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 42, i32 0, i32 78, i32 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +; GCN-LABEL: {{^}}tbuffer_load_immoffs_large +; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:2, 61 idxen offset:4095 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:3, {{s[0-9]+}} idxen offset:73 +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, {{s[0-9]+}} idxen offset:1 +; GCN: s_waitcnt +define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) { + %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 4095, i32 61, i32 47, i32 0) + %vdata_glc = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 73, i32 %soffs, i32 62, i32 0) + %vdata_slc = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 1, i32 %soffs, i32 77, i32 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float> + %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float> + %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0 + %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1 + %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2 + ret {<4 x float>, <4 x float>, <4 x float>} %r2 +} + +; GCN-LABEL: {{^}}tbuffer_load_idx: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen +define amdgpu_vs <4 x float> @tbuffer_load_idx(<4 x i32> inreg, i32 %vindex) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 78, i32 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +; GCN-LABEL: {{^}}tbuffer_load_ofs: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offen +define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 78, i32 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +; GCN-LABEL: {{^}}tbuffer_load_ofs_imm: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offen offset:52 +define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) { +main_body: + %ofs = add i32 %voffs, 52 + %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %ofs, i32 0, i32 78, i32 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +; GCN-LABEL: {{^}}tbuffer_load_both: +; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offen +define amdgpu_vs <4 x float> @tbuffer_load_both(<4 x i32> inreg, i32 %vindex, i32 %voffs) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 %voffs, i32 0, i32 78, i32 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + + +; GCN-LABEL: {{^}}buffer_load_xy: +; GCN: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 idxen +define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { + %vdata = call <2 x i32> @llvm.amdgcn.struct.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 77, i32 0) + %vdata.f = bitcast <2 x i32> %vdata to <2 x float> + ret <2 x float> %vdata.f +} + +; GCN-LABEL: {{^}}buffer_load_x: +; GCN: tbuffer_load_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 idxen +define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) { + %vdata = call i32 @llvm.amdgcn.struct.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 77, i32 0) + %vdata.f = bitcast i32 %vdata to float + ret float %vdata.f +} + +declare i32 @llvm.amdgcn.struct.tbuffer.load.i32(<4 x i32>, i32, i32, i32, i32, i32) +declare <2 x i32> @llvm.amdgcn.struct.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32, i32) +declare <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32) + diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll new file mode 100644 index 00000000000..b4e79093dda --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll @@ -0,0 +1,57 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX81 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED -check-prefix=GFX9 %s + + +; GCN-LABEL: {{^}}tbuffer_store_d16_x: +; GCN: s_load_dwordx4 +; GCN: s_load_dword{{[x0-9]*}} s{{\[}}[[S_LO:[0-9]+]] +; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]] +; GCN: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 %vindex) { +main_body: + call void @llvm.amdgcn.struct.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) + ret void +} + +; GCN-LABEL: {{^}}tbuffer_store_d16_xy: +; GCN: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} +; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] +; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] +; UNPACKED: tbuffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen + +; PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) { +main_body: + call void @llvm.amdgcn.struct.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) + ret void +} + +; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: +; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 + +; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} +; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] + +; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] +; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] +; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen + + +; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] +; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] +; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen +define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) { +main_body: + call void @llvm.amdgcn.struct.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32, i32) +declare void @llvm.amdgcn.struct.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32, i32) +declare void @llvm.amdgcn.struct.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll new file mode 100644 index 00000000000..ea451abd7b7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll @@ -0,0 +1,114 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GCN,VERDE %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}tbuffer_store: +; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 +; GCN: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], dfmt:12, nfmt:2, 0 idxen +; GCN: tbuffer_store_format_xyzw v[4:7], [[ZEROREG]], s[0:3], dfmt:13, nfmt:3, 0 idxen glc +; GCN: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], dfmt:14, nfmt:4, 0 idxen slc +; GCN: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], dfmt:14, nfmt:4, 0 idxen +define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +main_body: + %in1 = bitcast <4 x float> %1 to <4 x i32> + %in2 = bitcast <4 x float> %2 to <4 x i32> + %in3 = bitcast <4 x float> %3 to <4 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 44, i32 0) + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 61, i32 1) + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 78, i32 2) + call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 78, i32 0) + ret void +} + +; GCN-LABEL: {{^}}tbuffer_store_immoffs: +; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 +; GCN: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], dfmt:5, nfmt:7, 0 idxen offset:42 +define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +main_body: + %in1 = bitcast <4 x float> %1 to <4 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 117, i32 0) + ret void +} + +; GCN-LABEL: {{^}}tbuffer_store_scalar_and_imm_offs: +; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 +; GCN: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], dfmt:5, nfmt:7, {{s[0-9]+}} idxen offset:42 +define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) { +main_body: + %in1 = bitcast <4 x float> %vdata to <4 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 42, i32 %soffset, i32 117, i32 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_idx: +; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:2, 0 idxen +define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex) { +main_body: + %in1 = bitcast <4 x float> %vdata to <4 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 47, i32 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_ofs: +; GCN: tbuffer_store_format_xyzw v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], dfmt:3, nfmt:7, 0 idxen offen +define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) { +main_body: + %in1 = bitcast <4 x float> %vdata to <4 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 %voffset, i32 0, i32 115, i32 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_both: +; GCN: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], dfmt:6, nfmt:4, 0 idxen offen +define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex, i32 %voffset) { +main_body: + %in1 = bitcast <4 x float> %vdata to <4 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 %voffset, i32 0, i32 70, i32 0) + ret void +} + +; Ideally, the register allocator would avoid the wait here +; +; GCN-LABEL: {{^}}buffer_store_wait: +; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:3, 0 idxen +; VERDE: s_waitcnt expcnt(0) +; GCN: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen +; GCN: s_waitcnt vmcnt(0) +; GCN: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], dfmt:14, nfmt:2, 0 idxen +define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex.1, i32 %vindex.2, i32 %vindex.3) { +main_body: + %in1 = bitcast <4 x float> %vdata to <4 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex.1, i32 0, i32 0, i32 63, i32 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %vindex.2, i32 0, i1 0, i1 0) + %data.i = bitcast <4 x float> %data to <4 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %data.i, <4 x i32> %0, i32 %vindex.3, i32 0, i32 0, i32 46, i32 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_x1: +; GCN: tbuffer_store_format_x v0, v1, s[0:3], dfmt:13, nfmt:7, 0 idxen +define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { +main_body: + %data.i = bitcast float %data to i32 + call void @llvm.amdgcn.struct.tbuffer.store.i32(i32 %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 125, i32 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_x2: +; GCN: tbuffer_store_format_xy v[0:1], v2, s[0:3], dfmt:1, nfmt:2, 0 idxen +define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %vindex) { +main_body: + %data.i = bitcast <2 x float> %data to <2 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v2i32(<2 x i32> %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } + + diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll index abd0ed86ffe..4caa8081530 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll @@ -70,14 +70,14 @@ main_body: ; VERDE: s_waitcnt expcnt(0) ; GCN: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen ; GCN: s_waitcnt vmcnt(0) -; GCN: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], dfmt:16, nfmt:2, 0 idxen +; GCN: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], dfmt:14, nfmt:2, 0 idxen define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex.1, i32 %vindex.2, i32 %vindex.3) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex.1, i32 0, i32 0, i32 0, i32 15, i32 3, i1 0, i1 0) %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %vindex.2, i32 0, i1 0, i1 0) %data.i = bitcast <4 x float> %data to <4 x i32> - call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %data.i, <4 x i32> %0, i32 %vindex.3, i32 0, i32 0, i32 0, i32 16, i32 2, i1 0, i1 0) + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %data.i, <4 x i32> %0, i32 %vindex.3, i32 0, i32 0, i32 0, i32 14, i32 2, i1 0, i1 0) ret void } diff --git a/llvm/test/MC/AMDGPU/mtbuf.s b/llvm/test/MC/AMDGPU/mtbuf.s index e80d1ed232e..381b4f73c55 100644 --- a/llvm/test/MC/AMDGPU/mtbuf.s +++ b/llvm/test/MC/AMDGPU/mtbuf.s @@ -38,3 +38,18 @@ tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 // SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x1d,0x71] // VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x1d,0x71] +// nfmt is optional: +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, ttmp1 +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:0, ttmp1 ; encoding: [0x00,0x00,0x7f,0xe8,0x00,0x01,0x1d,0x71] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:0, ttmp1 ; encoding: [0x00,0x80,0x7b,0xe8,0x00,0x01,0x1d,0x71] + +// dfmt is optional: +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], nfmt:2, ttmp1 +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:0, nfmt:2, ttmp1 ; encoding: [0x00,0x00,0x07,0xe9,0x00,0x01,0x1d,0x71] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:0, nfmt:2, ttmp1 ; encoding: [0x00,0x80,0x03,0xe9,0x00,0x01,0x1d,0x71] + +// nfmt and dfmt can be in either order: +tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], nfmt:2, dfmt:15, ttmp1 +// SICI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 ; encoding: [0x00,0x00,0x7f,0xe9,0x00,0x01,0x1d,0x71] +// VI: tbuffer_store_format_xyzw v[1:4], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x01,0x1d,0x71] + |