diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
14 files changed, 1414 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll new file mode 100644 index 00000000000..03def71d591 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll @@ -0,0 +1,115 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI + +;CHECK-LABEL: {{^}}test1: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc +;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 offen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 offen offset:42 glc +;CHECK-DAG: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:4 glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_swap v0, off, s[0:3], 0{{$}} +define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %voffset) { +main_body: + %o1 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + %o3 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap(i32 %o1, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + %off5 = add i32 %voffset, 42 + %o5 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap(i32 %o3, <4 x i32> %rsrc, i32 %off5, i32 0, i32 0) + %o6 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap(i32 %o5, <4 x i32> %rsrc, i32 4, i32 8188, i32 0) + %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.swap(i32 %o6, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + %out = bitcast i32 %o6 to float + ret float %out +} + +;CHECK-LABEL: {{^}}test2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 offen glc{{$}} +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_sub v0, v1, s[0:3], 0 offen glc slc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_smin v0, v1, s[0:3], 0 offen glc{{$}} +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_umin v0, v1, s[0:3], 0 offen glc slc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_smax v0, v1, s[0:3], 0 offen glc{{$}} +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_umax v0, v1, s[0:3], 0 offen glc slc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_and v0, v1, s[0:3], 0 offen glc{{$}} +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_or v0, v1, s[0:3], 0 offen glc slc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_xor v0, v1, s[0:3], 0 offen glc +define amdgpu_ps float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %voffset) { +main_body: + %t1 = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %data, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + %t2 = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %t1, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2) + %t3 = call i32 @llvm.amdgcn.raw.buffer.atomic.smin(i32 %t2, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + %t4 = call i32 @llvm.amdgcn.raw.buffer.atomic.umin(i32 %t3, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2) + %t5 = call i32 @llvm.amdgcn.raw.buffer.atomic.smax(i32 %t4, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + %t6 = call i32 @llvm.amdgcn.raw.buffer.atomic.umax(i32 %t5, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2) + %t7 = call i32 @llvm.amdgcn.raw.buffer.atomic.and(i32 %t6, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + %t8 = call i32 @llvm.amdgcn.raw.buffer.atomic.or(i32 %t7, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2) + %t9 = call i32 @llvm.amdgcn.raw.buffer.atomic.xor(i32 %t8, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + %out = bitcast i32 %t9 to float + ret float %out +} + +; Ideally, we would teach tablegen & friends that cmpswap only modifies the +; first vgpr. Since we don't do that yet, the register allocator will have to +; create copies which we don't bother to track here. +; +;CHECK-LABEL: {{^}}test3: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 offen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 offen offset:44 glc +;CHECK-DAG: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:4 glc +define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) { +main_body: + %o1 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + %o3 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + %ofs.5 = add i32 %voffset, 44 + %o5 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %ofs.5, i32 0, i32 0) + %o6 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 4, i32 8188, i32 0) + +; Detecting the no-return variant doesn't work right now because of how the +; intrinsic is replaced by an instruction that feeds into an EXTRACT_SUBREG. +; Since there probably isn't a reasonable use-case of cmpswap that discards +; the return value, that seems okay. +; +; %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap(i32 %o6, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + %out = bitcast i32 %o6 to float + ret float %out +} + +;CHECK-LABEL: {{^}}test4: +;CHECK: buffer_atomic_add v0, +define amdgpu_ps float @test4() { +main_body: + %v = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> undef, i32 4, i32 0, i32 0) + %v.float = bitcast i32 %v to float + ret float %v.float +} + +declare i32 @llvm.amdgcn.raw.buffer.atomic.swap(i32, <4 x i32>, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.smin(i32, <4 x i32>, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.umin(i32, <4 x i32>, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.smax(i32, <4 x i32>, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.umax(i32, <4 x i32>, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.and(i32, <4 x i32>, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.or(i32, <4 x i32>, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.xor(i32, <4 x i32>, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i32) #0 + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.d16.ll new file mode 100644 index 00000000000..fb28bc0748b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.d16.ll @@ -0,0 +1,41 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s + +; GCN-LABEL: {{^}}buffer_load_format_d16_x: +; GCN: buffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 +define amdgpu_ps half @buffer_load_format_d16_x(<4 x i32> inreg %rsrc) { +main_body: + %data = call half @llvm.amdgcn.raw.buffer.load.format.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret half %data +} + +; GCN-LABEL: {{^}}buffer_load_format_d16_xy: +; UNPACKED: buffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: buffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] +define amdgpu_ps half @buffer_load_format_d16_xy(<4 x i32> inreg %rsrc) { +main_body: + %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + %elt = extractelement <2 x half> %data, i32 1 + ret half %elt +} + +; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw: +; UNPACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] +define amdgpu_ps half @buffer_load_format_d16_xyzw(<4 x i32> inreg %rsrc) { +main_body: + %data = call <4 x half> @llvm.amdgcn.raw.buffer.load.format.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + %elt = extractelement <4 x half> %data, i32 3 + ret half %elt +} + +declare half @llvm.amdgcn.raw.buffer.load.format.f16(<4 x i32>, i32, i32, i32) +declare <2 x half> @llvm.amdgcn.raw.buffer.load.format.v2f16(<4 x i32>, i32, i32, i32) +declare <4 x half> @llvm.amdgcn.raw.buffer.load.format.v4f16(<4 x i32>, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll new file mode 100644 index 00000000000..6f4ffbfd0d7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll @@ -0,0 +1,87 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI + +;CHECK-LABEL: {{^}}buffer_load: +;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 +;CHECK: buffer_load_format_xyzw v[4:7], off, s[0:3], 0 glc +;CHECK: buffer_load_format_xyzw v[8:11], off, s[0:3], 0 slc +;CHECK: s_waitcnt +define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { +main_body: + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0) + %data_glc = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 1) + %data_slc = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 2) + %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0 + %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1 + %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2 + ret {<4 x float>, <4 x float>, <4 x float>} %r2 +} + +;CHECK-LABEL: {{^}}buffer_load_immoffs: +;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42 +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { +main_body: + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 42, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_immoffs_large: +;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 60 offset:4092 +;CHECK-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7ffc +;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS1]] offset:4092 +;CHECK-DAG: s_mov_b32 [[OFS2:s[0-9]+]], 0x8ffc +;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:4 +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { +main_body: + %d.0 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 4092, i32 60, i32 0) + %d.1 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 4092, i32 32764, i32 0) + %d.2 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 4, i32 36860, i32 0) + %d.3 = fadd <4 x float> %d.0, %d.1 + %data = fadd <4 x float> %d.2, %d.3 + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_ofs: +;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { +main_body: + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_ofs_imm: +;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:60 +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { +main_body: + %ofs = add i32 %1, 60 + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 %ofs, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_x: +;CHECK: buffer_load_format_x v0, off, s[0:3], 0 +;CHECK: s_waitcnt +define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) { +main_body: + %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret float %data +} + +;CHECK-LABEL: {{^}}buffer_load_xy: +;CHECK: buffer_load_format_xy v[0:1], off, s[0:3], 0 +;CHECK: s_waitcnt +define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { +main_body: + %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret <2 x float> %data +} + +declare float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32>, i32, i32, i32) #0 +declare <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32) #0 + +attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll new file mode 100644 index 00000000000..797700e5b4f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll @@ -0,0 +1,206 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI + +;CHECK-LABEL: {{^}}buffer_load: +;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +;CHECK: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc +;CHECK: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc +;CHECK: s_waitcnt +define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { +main_body: + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0) + %data_glc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 1) + %data_slc = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 2) + %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0 + %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1 + %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2 + ret {<4 x float>, <4 x float>, <4 x float>} %r2 +} + +;CHECK-LABEL: {{^}}buffer_load_immoffs: +;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40 +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { +main_body: + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 40, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_immoffs_large: +;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc +;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:4 +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { +main_body: + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 4, i32 8188, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_ofs: +;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { +main_body: + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_ofs_imm: +;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:60 +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { +main_body: + %ofs = add i32 %1, 60 + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %ofs, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_x1: +;CHECK: buffer_load_dword v0, v0, s[0:3], 0 offen +;CHECK: s_waitcnt +define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %ofs) { +main_body: + %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0, i32 0) + ret float %data +} + +;CHECK-LABEL: {{^}}buffer_load_x2: +;CHECK: buffer_load_dwordx2 v[0:1], v0, s[0:3], 0 offen +;CHECK: s_waitcnt +define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %ofs) { +main_body: + %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0, i32 0) + ret <2 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_negative_offset: +;CHECK: v_add_{{[iu]}}32_e32 [[VOFS:v[0-9]+]], vcc, 0xfffff000, v0 +;CHECK: buffer_load_dwordx4 v[0:3], [[VOFS]], s[0:3], 0 offen offset:4080 +define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) { +main_body: + %ofs.1 = add i32 %ofs, -16 + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %ofs.1, i32 0, i32 0) + ret <4 x float> %data +} + +; SI won't merge ds memory operations, because of the signed offset bug, so +; we only have check lines for VI. +; CHECK-LABEL: buffer_load_mmo: +; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 +; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4 +define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, float addrspace(3)* %lds) { +entry: + store float 0.0, float addrspace(3)* %lds + %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0) + %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4 + store float 0.0, float addrspace(3)* %tmp2 + ret float %val +} + +;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 +;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 +;CHECK: s_waitcnt +define amdgpu_ps void @buffer_load_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a) { +main_body: + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 8 + %a3 = add i32 %a, 12 + %a4 = add i32 %a, 16 + %a5 = add i32 %a, 28 + %a6 = add i32 %a, 32 + %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a1, i32 0, i32 0) + %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a2, i32 0, i32 0) + %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a3, i32 0, i32 0) + %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a4, i32 0, i32 0) + %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a5, i32 0, i32 0) + %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a6, i32 0, i32 0) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged_glc_slc: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}} +;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}} +;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}} +;CHECK: s_waitcnt +define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a) { +main_body: + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 8 + %a3 = add i32 %a, 12 + %a4 = add i32 %a, 16 + %a5 = add i32 %a, 28 + %a6 = add i32 %a, 32 + %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a1, i32 0, i32 0) + %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a2, i32 0, i32 0) + %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a3, i32 0, i32 1) + %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a4, i32 0, i32 1) + %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a5, i32 0, i32 3) + %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %a6, i32 0, i32 3) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x2_offen_merged: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 +;CHECK: s_waitcnt +define amdgpu_ps void @buffer_load_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a) { +main_body: + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 12 + %vr1 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %a1, i32 0, i32 0) + %vr2 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %a2, i32 0, i32 0) + %r1 = extractelement <2 x float> %vr1, i32 0 + %r2 = extractelement <2 x float> %vr1, i32 1 + %r3 = extractelement <2 x float> %vr2, i32 0 + %r4 = extractelement <2 x float> %vr2, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x1_offset_merged: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 +;CHECK: s_waitcnt +define amdgpu_ps void @buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) { +main_body: + %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0) + %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 0) + %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 12, i32 0, i32 0) + %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 16, i32 0, i32 0) + %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 28, i32 0, i32 0) + %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 32, i32 0, i32 0) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}buffer_load_x2_offset_merged: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +;CHECK: s_waitcnt +define amdgpu_ps void @buffer_load_x2_offset_merged(<4 x i32> inreg %rsrc) { +main_body: + %vr1 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0) + %vr2 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 12, i32 0, i32 0) + %r1 = extractelement <2 x float> %vr1, i32 0 + %r2 = extractelement <2 x float> %vr1, i32 1 + %r3 = extractelement <2 x float> %vr2, i32 0 + %r4 = extractelement <2 x float> %vr2, i32 1 + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) + ret void +} + +declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #0 +declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 + +attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll new file mode 100644 index 00000000000..139496282ad --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll @@ -0,0 +1,57 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,UNPACKED %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,GFX81 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,GFX9 %s + +; GCN-LABEL: {{^}}buffer_store_format_d16_x: +; GCN: s_load_dword s[[LO:[0-9]+]] +; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]] +; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %voffset) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_format_d16_xy: + +; UNPACKED: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} +; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] +; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] +; UNPACKED: buffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen + +; PACKED: buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +define amdgpu_kernel void @buffer_store_format_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %voffset) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: +; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 + +; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} +; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] + +; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] +; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] + +; UNPACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen + +; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] +; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] + +; PACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %voffset) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.format.f16(half, <4 x i32>, i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i32) +declare void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.ll new file mode 100644 index 00000000000..ed244397223 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.ll @@ -0,0 +1,76 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}buffer_store: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0 +;CHECK: buffer_store_format_xyzw v[4:7], off, s[0:3], 0 glc +;CHECK: buffer_store_format_xyzw v[8:11], off, s[0:3], 0 slc +define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 1) + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 2) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_immoffs: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0 offset:42 +define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_ofs: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 offen +define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0) + ret void +} + +; Ideally, the register allocator would avoid the wait here +; +;CHECK-LABEL: {{^}}buffer_store_wait: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 offen +;VERDE: s_waitcnt expcnt(0) +;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 offen +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 offen +define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x1: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 offen +define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %offset) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.format.f32(float %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 offen +define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %offset) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.format.f32(float, <4 x i32>, i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll new file mode 100644 index 00000000000..64865115966 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll @@ -0,0 +1,151 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}buffer_store: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +;CHECK: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc +;CHECK: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc +define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 1) + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 2) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_immoffs: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42 +define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_ofs: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0) + ret void +} + +; Ideally, the register allocator would avoid the wait here +; +;CHECK-LABEL: {{^}}buffer_store_wait: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +;VERDE: s_waitcnt expcnt(0) +;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen +define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x1: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen +define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %offset) { +main_body: + call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %offset) #0 { +main_body: + call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged: +;CHECK-NOT: s_waitcnt +;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 +;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 +define amdgpu_ps void @buffer_store_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 8 + %a3 = add i32 %a, 12 + %a4 = add i32 %a, 16 + %a5 = add i32 %a, 28 + %a6 = add i32 %a, 32 + call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %a2, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 %a3, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 %a4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 %a5, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 %a6, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_glc_slc: +;CHECK-NOT: s_waitcnt +;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}} +;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}} +;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}} +define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 8 + %a3 = add i32 %a, 12 + %a4 = add i32 %a, 16 + %a5 = add i32 %a, 28 + %a6 = add i32 %a, 32 + call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %a2, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 %a3, i32 0, i32 1) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 %a4, i32 0, i32 1) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 %a5, i32 0, i32 3) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 %a6, i32 0, i32 3) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 +define amdgpu_ps void @buffer_store_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) { + %a1 = add i32 %a, 4 + %a2 = add i32 %a, 12 + call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 %a2, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x1_offset_merged: +;CHECK-NOT: s_waitcnt +;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 +define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { + call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 16, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 28, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 32, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1,<2 x float> %v2) { + call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 12, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0 +declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll new file mode 100644 index 00000000000..271e8681b38 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll @@ -0,0 +1,127 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI + +;CHECK-LABEL: {{^}}test1: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_atomic_swap v0, {{v[0-9]+}}, s[0:3], 0 idxen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_swap v0, {{v[0-9]+}}, s[0:3], 0 idxen glc +;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_swap v0, {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_swap v0, {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_swap v0, v[1:2], s[0:3], 0 idxen offen offset:42 glc +;CHECK-DAG: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_swap v0, {{v[0-9]+}}, s[0:3], [[SOFS]] idxen offset:4 glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_swap v0, {{v[0-9]+}}, s[0:3], 0 idxen{{$}} +define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) { +main_body: + %o1 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %o2 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %o1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %o3 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %o2, <4 x i32> %rsrc, i32 0, i32 %voffset, i32 0, i32 0) + %o4 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %o3, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 0, i32 0) + %ofs.5 = add i32 %voffset, 42 + %o5 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %o4, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i32 0, i32 0) + %o6 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %o5, <4 x i32> %rsrc, i32 0, i32 4, i32 8188, i32 0) + %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %o6, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %out = bitcast i32 %o6 to float + ret float %out +} + +;CHECK-LABEL: {{^}}test2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 idxen glc{{$}} +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_sub v0, v1, s[0:3], 0 idxen glc slc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_smin v0, v1, s[0:3], 0 idxen glc{{$}} +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_umin v0, v1, s[0:3], 0 idxen glc slc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_smax v0, v1, s[0:3], 0 idxen glc{{$}} +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_umax v0, v1, s[0:3], 0 idxen glc slc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_and v0, v1, s[0:3], 0 idxen glc{{$}} +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_or v0, v1, s[0:3], 0 idxen glc slc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_xor v0, v1, s[0:3], 0 idxen glc +define amdgpu_ps float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) { +main_body: + %t1 = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %t2 = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %t1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2) + %t3 = call i32 @llvm.amdgcn.struct.buffer.atomic.smin(i32 %t2, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %t4 = call i32 @llvm.amdgcn.struct.buffer.atomic.umin(i32 %t3, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2) + %t5 = call i32 @llvm.amdgcn.struct.buffer.atomic.smax(i32 %t4, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %t6 = call i32 @llvm.amdgcn.struct.buffer.atomic.umax(i32 %t5, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2) + %t7 = call i32 @llvm.amdgcn.struct.buffer.atomic.and(i32 %t6, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %t8 = call i32 @llvm.amdgcn.struct.buffer.atomic.or(i32 %t7, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2) + %t9 = call i32 @llvm.amdgcn.struct.buffer.atomic.xor(i32 %t8, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %out = bitcast i32 %t9 to float + ret float %out +} + +; Ideally, we would teach tablegen & friends that cmpswap only modifies the +; first vgpr. Since we don't do that yet, the register allocator will have to +; create copies which we don't bother to track here. +; +;CHECK-LABEL: {{^}}test3: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 idxen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen offset:44 glc +;CHECK-DAG: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], [[SOFS]] idxen offset:4 glc +define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) { +main_body: + %o1 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %o2 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %o3 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %o2, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %voffset, i32 0, i32 0) + %o4 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 0, i32 0) + %offs.5 = add i32 %voffset, 44 + %o5 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %o4, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %offs.5, i32 0, i32 0) + %o6 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 4, i32 8188, i32 0) + +; Detecting the no-return variant doesn't work right now because of how the +; intrinsic is replaced by an instruction that feeds into an EXTRACT_SUBREG. +; Since there probably isn't a reasonable use-case of cmpswap that discards +; the return value, that seems okay. +; +; %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %o6, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %out = bitcast i32 %o6 to float + ret float %out +} + +;CHECK-LABEL: {{^}}test4: +;CHECK: buffer_atomic_add v0, +define amdgpu_ps float @test4() { +main_body: + %v = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> undef, i32 0, i32 4, i32 0, i32 0) + %v.float = bitcast i32 %v to float + ret float %v.float +} + +declare i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.smin(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.umin(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.smax(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.umax(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.and(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.or(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.xor(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i32, i32) #0 + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.d16.ll new file mode 100644 index 00000000000..91188e8607b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.d16.ll @@ -0,0 +1,41 @@ +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s + +; GCN-LABEL: {{^}}buffer_load_format_d16_x: +; GCN: buffer_load_format_d16_x v{{[0-9]+}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +define amdgpu_ps half @buffer_load_format_d16_x(<4 x i32> inreg %rsrc) { +main_body: + %data = call half @llvm.amdgcn.struct.buffer.load.format.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret half %data +} + +; GCN-LABEL: {{^}}buffer_load_format_d16_xy: +; UNPACKED: buffer_load_format_d16_xy v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: buffer_load_format_d16_xy v[[FULL:[0-9]+]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] +define amdgpu_ps half @buffer_load_format_d16_xy(<4 x i32> inreg %rsrc) { +main_body: + %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %elt = extractelement <2 x half> %data, i32 1 + ret half %elt +} + +; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw: +; UNPACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] + +; PACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] +define amdgpu_ps half @buffer_load_format_d16_xyzw(<4 x i32> inreg %rsrc) { +main_body: + %data = call <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %elt = extractelement <4 x half> %data, i32 3 + ret half %elt +} + +declare half @llvm.amdgcn.struct.buffer.load.format.f16(<4 x i32>, i32, i32, i32, i32) +declare <2 x half> @llvm.amdgcn.struct.buffer.load.format.v2f16(<4 x i32>, i32, i32, i32, i32) +declare <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(<4 x i32>, i32, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll new file mode 100644 index 00000000000..6aee3827e9c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll @@ -0,0 +1,115 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI + +;CHECK-LABEL: {{^}}buffer_load: +;CHECK: buffer_load_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen +;CHECK: buffer_load_format_xyzw v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc +;CHECK: buffer_load_format_xyzw v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc +;CHECK: s_waitcnt +define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0) + %data_glc = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 1) + %data_slc = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 2) + %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0 + %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1 + %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2 + ret {<4 x float>, <4 x float>, <4 x float>} %r2 +} + +;CHECK-LABEL: {{^}}buffer_load_immoffs: +;CHECK: buffer_load_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 42, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_immoffs_large: +;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 60 idxen offset:4092 +;CHECK-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7ffc +;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], [[OFS1]] idxen offset:4092 +;CHECK-DAG: s_mov_b32 [[OFS2:s[0-9]+]], 0x8ffc +;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], [[OFS2]] idxen offset:4 +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { +main_body: + %d.0 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4092, i32 60, i32 0) + %d.1 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4092, i32 32764, i32 0) + %d.2 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4, i32 36860, i32 0) + %d.3 = fadd <4 x float> %d.0, %d.1 + %data = fadd <4 x float> %d.2, %d.3 + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_idx: +;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_ofs: +;CHECK: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %1, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_ofs_imm: +;CHECK: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60 +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { +main_body: + %ofs = add i32 %1, 60 + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_both: +;CHECK: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 %2, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_both_reversed: +;CHECK: v_mov_b32_e32 v2, v0 +;CHECK: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %2, i32 %1, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_x: +;CHECK: buffer_load_format_x v0, {{v[0-9]+}}, s[0:3], 0 idxen +;CHECK: s_waitcnt +define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) { +main_body: + %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret float %data +} + +;CHECK-LABEL: {{^}}buffer_load_xy: +;CHECK: buffer_load_format_xy v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen +;CHECK: s_waitcnt +define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { +main_body: + %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret <2 x float> %data +} + +declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32) #0 +declare <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32) #0 + +attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll new file mode 100644 index 00000000000..cd373e0796b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll @@ -0,0 +1,133 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI + +;CHECK-LABEL: {{^}}buffer_load: +;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen +;CHECK: buffer_load_dwordx4 v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc +;CHECK: buffer_load_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc +;CHECK: s_waitcnt +define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0) + %data_glc = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 1) + %data_slc = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 2) + %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0 + %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1 + %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2 + ret {<4 x float>, <4 x float>, <4 x float>} %r2 +} + +;CHECK-LABEL: {{^}}buffer_load_immoffs: +;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:40 +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 40, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_immoffs_large: +;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc +;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], [[OFFSET]] idxen offset:4 +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 4, i32 8188, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_idx: +;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_ofs: +;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_ofs_imm: +;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60 +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { +main_body: + %ofs = add i32 %1, 60 + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_both: +;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_both_reversed: +;CHECK: v_mov_b32_e32 v2, v0 +;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen +;CHECK: s_waitcnt +define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) { +main_body: + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i32 0, i32 0) + ret <4 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_x1: +;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen +;CHECK: s_waitcnt +define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +main_body: + %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) + ret float %data +} + +;CHECK-LABEL: {{^}}buffer_load_x2: +;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen +;CHECK: s_waitcnt +define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { +main_body: + %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) + ret <2 x float> %data +} + +;CHECK-LABEL: {{^}}buffer_load_negative_offset: +;CHECK: v_add_{{[iu]}}32_e32 {{v[0-9]+}}, vcc, 0xfffff000, v0 +;CHECK: buffer_load_dwordx4 v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen offset:4080 +define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) { +main_body: + %ofs.1 = add i32 %ofs, -16 + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs.1, i32 0, i32 0) + ret <4 x float> %data +} + +; SI won't merge ds memory operations, because of the signed offset bug, so +; we only have check lines for VI. +; CHECK-LABEL: buffer_load_mmo: +; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 +; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4 +define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, float addrspace(3)* %lds) { +entry: + store float 0.0, float addrspace(3)* %lds + %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4 + store float 0.0, float addrspace(3)* %tmp2 + ret float %val +} + +declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #0 +declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 + +attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll new file mode 100644 index 00000000000..b3e10d671c3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll @@ -0,0 +1,57 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,UNPACKED %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,GFX81 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,GFX9 %s + +; GCN-LABEL: {{^}}buffer_store_format_d16_x: +; GCN: s_load_dword s[[LO:[0-9]+]] +; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]] +; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %index) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_format_d16_xy: + +; UNPACKED: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} +; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] +; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] +; UNPACKED: buffer_store_format_d16_xy v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen + +; PACKED: buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +define amdgpu_kernel void @buffer_store_format_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %index) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.format.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: +; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10 + +; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} +; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] + +; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] +; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] + +; UNPACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen + +; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] +; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] + +; PACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen +define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %index) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.format.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.format.f16(half, <4 x i32>, i32, i32, i32, i32) +declare void @llvm.amdgcn.struct.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32) +declare void @llvm.amdgcn.struct.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.ll new file mode 100644 index 00000000000..0b145540293 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.ll @@ -0,0 +1,104 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}buffer_store: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen +;CHECK: buffer_store_format_xyzw v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc +;CHECK: buffer_store_format_xyzw v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc +define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1) + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 2) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_immoffs: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 +define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_idx: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_ofs: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen +define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_both: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen +define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_both_reversed: +;CHECK: v_mov_b32_e32 v6, v4 +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xyzw v[0:3], v[5:6], s[0:3], 0 idxen offen +define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i32 0, i32 0) + ret void +} + +; Ideally, the register allocator would avoid the wait here +; +;CHECK-LABEL: {{^}}buffer_store_wait: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen +;VERDE: s_waitcnt expcnt(0) +;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x1: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.format.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.format.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.format.f32(float, <4 x i32>, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll new file mode 100644 index 00000000000..8e05c43889a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll @@ -0,0 +1,104 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}buffer_store: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen +;CHECK: buffer_store_dwordx4 v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc +;CHECK: buffer_store_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc +define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1) + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 2) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_immoffs: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 +define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_idx: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_ofs: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen +define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_both: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen +define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_both_reversed: +;CHECK: v_mov_b32_e32 v6, v4 +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen +define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i32 0, i32 0) + ret void +} + +; Ideally, the register allocator would avoid the wait here +; +;CHECK-LABEL: {{^}}buffer_store_wait: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen +;VERDE: s_waitcnt expcnt(0) +;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x1: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) { +main_body: + call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x2: +;CHECK-NOT: s_waitcnt +;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen +define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 { +main_body: + call void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) + ret void +} + +declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } |