diff options
-rw-r--r-- | llvm/lib/Target/AMDGPU/BUFInstructions.td | 48 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIIntrinsics.td | 14 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll | 54 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/mubuf.ll | 6 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll | 4 |
5 files changed, 5 insertions, 121 deletions
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index e48b73b0f1e..af45028cb26 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1422,54 +1422,6 @@ defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, i16, az_extloadi8_private>; defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, i16, sextloadi8_private>; } - -// BUFFER_LOAD_DWORD*, addr64=0 -multiclass MUBUF_Load_Dword <ValueType vt, - MUBUF_Pseudo offset, - MUBUF_Pseudo offen, - MUBUF_Pseudo idxen, - MUBUF_Pseudo bothen> { - - def : GCNPat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset, - imm:$offset, 0, 0, imm:$glc, imm:$slc, - imm:$tfe)), - (offset $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), - (as_i1imm $slc), (as_i1imm $tfe)) - >; - - def : GCNPat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, - imm:$offset, 1, 0, imm:$glc, imm:$slc, - imm:$tfe)), - (offen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $tfe)) - >; - - def : GCNPat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, - imm:$offset, 0, 1, imm:$glc, imm:$slc, - imm:$tfe)), - (idxen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), - (as_i1imm $slc), (as_i1imm $tfe)) - >; - - def : GCNPat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset, - imm:$offset, 1, 1, imm:$glc, imm:$slc, - imm:$tfe)), - (bothen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $tfe)) - >; -} - -defm : MUBUF_Load_Dword <i32, BUFFER_LOAD_DWORD_OFFSET, BUFFER_LOAD_DWORD_OFFEN, - BUFFER_LOAD_DWORD_IDXEN, BUFFER_LOAD_DWORD_BOTHEN>; -defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_OFFEN, - BUFFER_LOAD_DWORDX2_IDXEN, BUFFER_LOAD_DWORDX2_BOTHEN>; -defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN, - BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>; - multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET, ValueType vt, PatFrag atomic_st> { // Store follows atomic op convention so address is forst diff --git a/llvm/lib/Target/AMDGPU/SIIntrinsics.td b/llvm/lib/Target/AMDGPU/SIIntrinsics.td index 7b7cf163505..3892f19d3df 100644 --- a/llvm/lib/Target/AMDGPU/SIIntrinsics.td +++ b/llvm/lib/Target/AMDGPU/SIIntrinsics.td @@ -34,18 +34,4 @@ let TargetPrefix = "SI", isTarget = 1 in { llvm_i32_ty], // tfe(imm) []>; - // Fully-flexible BUFFER_LOAD_DWORD_* except for the ADDR64 bit, which is not exposed - def int_SI_buffer_load_dword : Intrinsic < - [llvm_anyint_ty], // vdata(VGPR), overloaded for types i32, v2i32, v4i32 - [llvm_anyint_ty, // rsrc(SGPR) - llvm_anyint_ty, // vaddr(VGPR) - llvm_i32_ty, // soffset(SGPR) - llvm_i32_ty, // inst_offset(imm) - llvm_i32_ty, // offen(imm) - llvm_i32_ty, // idxen(imm) - llvm_i32_ty, // glc(imm) - llvm_i32_ty, // slc(imm) - llvm_i32_ty], // tfe(imm) - [IntrReadMem, IntrArgMemOnly]>; - } // End TargetPrefix = "SI", isTarget = 1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll b/llvm/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll deleted file mode 100644 index 1d370aba6da..00000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll +++ /dev/null @@ -1,54 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=verde -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s - -; Example of a simple geometry shader loading vertex attributes from the -; ESGS ring buffer - -; FIXME: Out of bounds immediate offset crashes - -; CHECK-LABEL: {{^}}main: -; CHECK: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 glc slc -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen glc slc -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen glc slc -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen glc slc -; CHECK: s_movk_i32 [[K:s[0-9]+]], 0x4d2 ; encoding -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, [[K]] idxen offen offset:65535 glc slc - -define amdgpu_vs void @main([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <32 x i8>] addrspace(4)* byval %arg2, [2 x <4 x i32>] addrspace(4)* byval %arg3, [17 x <4 x i32>] addrspace(4)* inreg %arg4, [17 x <4 x i32>] addrspace(4)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) { -main_body: - %tmp = getelementptr [2 x <4 x i32>], [2 x <4 x i32>] addrspace(4)* %arg3, i64 0, i32 1 - %tmp10 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0 - %tmp11 = shl i32 %arg6, 2 - %tmp12 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0) - %tmp13 = bitcast i32 %tmp12 to float - %tmp14 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp10, i32 %tmp11, i32 0, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0) - %tmp15 = bitcast i32 %tmp14 to float - %tmp16 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp10, i32 %tmp11, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 0) - %tmp17 = bitcast i32 %tmp16 to float - %tmp18 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<4 x i32> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 0) - %tmp19 = bitcast i32 %tmp18 to float - - %tmp20 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<4 x i32> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 123, i32 1, i32 1, i32 1, i32 1, i32 0) - %tmp21 = bitcast i32 %tmp20 to float - - %tmp22 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<4 x i32> %tmp10, <2 x i32> zeroinitializer, i32 1234, i32 65535, i32 1, i32 1, i32 1, i32 1, i32 0) - %tmp23 = bitcast i32 %tmp22 to float - - call void @llvm.amdgcn.exp.f32(i32 15, i32 12, float %tmp13, float %tmp15, float %tmp17, float %tmp19, i1 false, i1 false) - call void @llvm.amdgcn.exp.f32(i32 15, i32 12, float %tmp21, float %tmp23, float %tmp23, float %tmp23, i1 true, i1 false) - ret void -} - -; Function Attrs: nounwind readonly -declare i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 - -; Function Attrs: nounwind readonly -declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<4 x i32>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #0 - -declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1 - -attributes #0 = { nounwind readonly } -attributes #1 = { nounwind inaccessiblememonly } - -!0 = !{!"const", !1, i32 1} -!1 = !{!"tbaa root"} diff --git a/llvm/test/CodeGen/AMDGPU/mubuf.ll b/llvm/test/CodeGen/AMDGPU/mubuf.ll index d5c22d22cca..9cc48b41d6d 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf.ll @@ -60,7 +60,7 @@ main_body: %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(4)* %0, i32 0, i32 0 %tmp1 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp0 %tmp2 = shl i32 %6, 2 - %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) + %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.dword.i32(<4 x i32> %tmp1, i32 %tmp2, i32 64, i32 1) %tmp4 = add i32 %6, 16 %tmp1.4xi32 = bitcast <4 x i32> %tmp1 to <4 x i32> call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1) @@ -79,7 +79,7 @@ main_body: %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(4)* %0, i32 0, i32 0 %tmp1 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp0 %tmp2 = shl i32 %6, 2 - %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) + %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.dword.i32(<4 x i32> %tmp1, i32 %tmp2, i32 65, i32 1) %tmp4 = add i32 %6, 16 %tmp1.4xi32 = bitcast <4 x i32> %tmp1 to <4 x i32> call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1) @@ -176,7 +176,7 @@ define amdgpu_kernel void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 { ret void } -declare i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.load.dword.i32(<4 x i32>, i32, i32, i32) #0 declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) attributes #0 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll b/llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll index 5edc2c5c9b7..d93cde6a885 100644 --- a/llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll @@ -24,7 +24,7 @@ main_body: %array_vector9 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %tmp1, i32 1 %array_vector10 = insertelement <4 x float> %array_vector9, float 0.000000e+00, i32 2 %array_vector11 = insertelement <4 x float> %array_vector10, float undef, i32 3 - %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> undef, i32 undef, i32 4864, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) + %tmp3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> undef, i32 undef, i32 4864, i32 0) call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 36, i32 4, i32 4, i1 1, i1 1) %bc = bitcast <4 x float> %array_vector3 to <4 x i32> %tmp4 = extractelement <4 x i32> %bc, i32 undef @@ -46,7 +46,7 @@ main_body: } declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 -declare i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2 +declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32) #2 declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #3 attributes #0 = { nounwind "target-cpu"="tonga" } |