diff options
| author | Nicolai Haehnle <nhaehnle@gmail.com> | 2018-06-21 13:37:19 +0000 | 
|---|---|---|
| committer | Nicolai Haehnle <nhaehnle@gmail.com> | 2018-06-21 13:37:19 +0000 | 
| commit | 1045928aab8d33bcef57e004ebdd5482468922e6 (patch) | |
| tree | 53c7e203d7c333530c556bae6b15621f2e62f10c /llvm/test/CodeGen/AMDGPU | |
| parent | 7a9c03f484fee744b1cbcc45e77f949b4ebfcbee (diff) | |
| download | bcm5719-llvm-1045928aab8d33bcef57e004ebdd5482468922e6.tar.gz bcm5719-llvm-1045928aab8d33bcef57e004ebdd5482468922e6.zip  | |
AMDGPU: Convert test cases to the dimension-aware intrinsics
Summary:
Also explicitly port over some tests in llvm.amdgcn.image.* that were
missing. Some tests are removed because they no longer apply (i.e.
explicitly testing building an address vector via insertelement).
This is in preparation for the eventual removal of the old-style
intrinsics.
Some additional notes:
- constant-address-space-32bit.ll: change some GCN-NEXT to GCN because
  the instruction schedule was subtly altered
- insert_vector_elt.ll: the old test didn't actually test anything,
  because %tmp1 was not used; remove the load, because it doesn't work
  (Because of the amdgpu_ps calling convention? In any case, it's
  orthogonal to what the test claims to be testing.)
Change-Id: Idfa99b6512ad139e755e82b8b89548ab08f0afcf
Reviewers: arsenm, rampitec
Subscribers: MatzeB, qcolombet, kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, javed.absar, llvm-commits
Differential Revision: https://reviews.llvm.org/D48018
llvm-svn: 335229
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
24 files changed, 772 insertions, 361 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll b/llvm/test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll index e967723384b..c9905c1cb77 100644 --- a/llvm/test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/adjust-writemask-invalid-copy.ll @@ -1,13 +1,13 @@  ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s  ; GCN-LABEL: {{^}}adjust_writemask_crash_0_nochain: -; GCN: image_get_lod v0, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x2 +; GCN: image_get_lod v0, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x2  ; GCN-NOT: v1  ; GCN-NOT: v0  ; GCN: buffer_store_dword v0  define amdgpu_ps void @adjust_writemask_crash_0_nochain() #0 {  main_body: -  %tmp = call <2 x float> @llvm.amdgcn.image.getlod.v2f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp = call <2 x float> @llvm.amdgcn.image.getlod.1d.v2f32.f32(i32 3, float undef, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)    %tmp1 = bitcast <2 x float> %tmp to <2 x i32>    %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>    %tmp3 = bitcast <4 x i32> %tmp2 to <4 x float> @@ -17,13 +17,13 @@ main_body:  }  ; GCN-LABEL: {{^}}adjust_writemask_crash_1_nochain: -; GCN: image_get_lod v0, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x1 +; GCN: image_get_lod v0, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x1  ; GCN-NOT: v1  ; GCN-NOT: v0  ; GCN: buffer_store_dword v0  define amdgpu_ps void @adjust_writemask_crash_1_nochain() #0 {  main_body: -  %tmp = call <2 x float> @llvm.amdgcn.image.getlod.v2f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp = call <2 x float> @llvm.amdgcn.image.getlod.1d.v2f32.f32(i32 
3, float undef, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)    %tmp1 = bitcast <2 x float> %tmp to <2 x i32>    %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>    %tmp3 = bitcast <4 x i32> %tmp2 to <4 x float> @@ -33,13 +33,13 @@ main_body:  }  ; GCN-LABEL: {{^}}adjust_writemask_crash_0_chain: -; GCN: image_sample v0, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x2 +; GCN: image_sample v0, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x2  ; GCN-NOT: v1  ; GCN-NOT: v0  ; GCN: buffer_store_dword v0  define amdgpu_ps void @adjust_writemask_crash_0_chain() #0 {  main_body: -  %tmp = call <2 x float> @llvm.amdgcn.image.sample.v2f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float undef, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)    %tmp1 = bitcast <2 x float> %tmp to <2 x i32>    %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>    %tmp3 = bitcast <4 x i32> %tmp2 to <4 x float> @@ -49,13 +49,13 @@ main_body:  }  ; GCN-LABEL: {{^}}adjust_writemask_crash_1_chain: -; GCN: image_sample v0, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x1 +; GCN: image_sample v0, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x1  ; GCN-NOT: v1  ; GCN-NOT: v0  ; GCN: buffer_store_dword v0  define amdgpu_ps void @adjust_writemask_crash_1_chain() #0 {  main_body: -  %tmp = call <2 x float> @llvm.amdgcn.image.sample.v2f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float undef, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)    %tmp1 = bitcast <2 x float> 
%tmp to <2 x i32>    %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>    %tmp3 = bitcast <4 x i32> %tmp2 to <4 x float> @@ -66,7 +66,7 @@ main_body:  define amdgpu_ps void @adjust_writemask_crash_0_v4() #0 {  main_body: -  %tmp = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 5, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp = call <4 x float> @llvm.amdgcn.image.getlod.1d.v4f32.f32(i32 5, float undef, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)    %tmp1 = bitcast <4 x float> %tmp to <4 x i32>    %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>    %tmp3 = bitcast <4 x i32> %tmp2 to <4 x float> @@ -76,9 +76,9 @@ main_body:  } -declare <2 x float> @llvm.amdgcn.image.sample.v2f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1 -declare <2 x float> @llvm.amdgcn.image.getlod.v2f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1 +declare <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <2 x float> @llvm.amdgcn.image.getlod.1d.v2f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.getlod.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1  attributes #0 = { nounwind }  attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll index 84d8bf2bd70..415a3156699 100644 --- a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll +++ b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll @@ -7,7 +7,7 @@  define amdgpu_ps float @main(float %arg0, float %arg1) #0 {  bb:    %tmp = fptosi float %arg0 to i32 -  %tmp1 = call <4 x 
float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> undef, <8 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false) +  %tmp1 = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 undef, <8 x i32> undef, i32 0, i32 0)    %tmp2.f = extractelement <4 x float> %tmp1, i32 0    %tmp2 = bitcast float %tmp2.f to i32    %tmp3 = and i32 %tmp, 7 @@ -21,7 +21,7 @@ bb:  }  declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 -declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #2  attributes #0 = { nounwind }  attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll index 61ad224e4d7..4522497a85e 100644 --- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -204,7 +204,7 @@ define amdgpu_vs i32 @load_i32_hifffffff0(i32 addrspace(6)* inreg %p) #4 {  ; GCN: v_readfirstlane_b32  ; GCN-NEXT: v_readfirstlane_b32  ; SI: s_nop -; GCN-NEXT: s_load_dwordx8 +; GCN: s_load_dwordx8  ; GCN-NEXT: s_load_dwordx4  ; GCN: image_sample  define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @load_sampler([0 x <4 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <4 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 { @@ -219,7 +219,7 @@ main_body:    %29 = bitcast [0 x 
<8 x i32>] addrspace(6)* %1 to [0 x <4 x i32>] addrspace(6)*    %30 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(6)* %29, i32 0, i32 %28, !amdgpu.uniform !0    %31 = load <4 x i32>, <4 x i32> addrspace(6)* %30, align 16, !invariant.load !0 -  %32 = call nsz <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> zeroinitializer, <8 x i32> %26, <4 x i32> %31, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #8 +  %32 = call nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> %26, <4 x i32> %31, i1 0, i32 0, i32 0) #8    %33 = extractelement <4 x float> %32, i32 0    %34 = extractelement <4 x float> %32, i32 1    %35 = extractelement <4 x float> %32, i32 2 @@ -238,7 +238,7 @@ main_body:  ; GCN: v_readfirstlane_b32  ; GCN-NEXT: v_readfirstlane_b32  ; SI: s_nop -; GCN-NEXT: s_load_dwordx8 +; GCN: s_load_dwordx8  ; GCN-NEXT: s_load_dwordx4  ; GCN: image_sample  define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @load_sampler_nouniform([0 x <4 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <4 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 { @@ -253,7 +253,7 @@ main_body:    %29 = bitcast [0 x <8 x i32>] addrspace(6)* %1 to [0 x <4 x i32>] addrspace(6)*    %30 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(6)* %29, i32 0, i32 %28    %31 = load <4 x i32>, <4 x i32> addrspace(6)* %30, align 16, !invariant.load !0 -  %32 = call nsz <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> zeroinitializer, <8 
x i32> %26, <4 x i32> %31, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #8 +  %32 = call nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> %26, <4 x i32> %31, i1 0, i32 0, i32 0) #8    %33 = extractelement <4 x float> %32, i32 0    %34 = extractelement <4 x float> %32, i32 1    %35 = extractelement <4 x float> %32, i32 2 @@ -272,7 +272,7 @@ main_body:  declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #6  ; Function Attrs: nounwind readonly -declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #7 +declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #7  !0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/else.ll b/llvm/test/CodeGen/AMDGPU/else.ll index c73ea936e8b..38c9379fe2d 100644 --- a/llvm/test/CodeGen/AMDGPU/else.ll +++ b/llvm/test/CodeGen/AMDGPU/else.ll @@ -44,7 +44,7 @@ if:  else:    %c = fmul float %v, 3.0 -  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)    %v.else = extractelement <4 x float> %tex, i32 0    br label %end @@ -55,7 +55,7 @@ end:  }  declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2  attributes #0 = { nounwind }  attributes #1 = { nounwind writeonly } diff --git a/llvm/test/CodeGen/AMDGPU/image-schedule.ll b/llvm/test/CodeGen/AMDGPU/image-schedule.ll index 6f8060f1d55..990b736262f 100644 --- a/llvm/test/CodeGen/AMDGPU/image-schedule.ll +++ 
b/llvm/test/CodeGen/AMDGPU/image-schedule.ll @@ -25,17 +25,18 @@ define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg, i32 inreg %arg1    %tmp10 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(4)* %tmp8, i64 0, i64 32    %tmp11 = bitcast i8 addrspace(4)* %tmp10 to <8 x i32> addrspace(4)*, !amdgpu.uniform !0    %tmp12 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp11, align 16 -  %tmp13 = shufflevector <3 x i32> %tmp9, <3 x i32> undef, <2 x i32> <i32 0, i32 1> -  %tmp14 = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %tmp13, <8 x i32> %tmp12, i32 15, i1 false, i1 false, i1 false, i1 false) #0 +  %tmp13.0 = extractelement <3 x i32> %tmp9, i32 0 +  %tmp13.1 = extractelement <3 x i32> %tmp9, i32 1 +  %tmp14 = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %tmp13.0, i32 %tmp13.1, <8 x i32> %tmp12, i32 0, i32 0) #0    %tmp15 = inttoptr i64 %tmp7 to <8 x i32> addrspace(4)*    %tmp16 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp15, align 16 -  call void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float> %tmp14, <2 x i32> %tmp13, <8 x i32> %tmp16, i32 15, i1 false, i1 false, i1 false, i1 false) #0 +  call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %tmp14, i32 15, i32 %tmp13.0, i32 %tmp13.1, <8 x i32> %tmp16, i32 0, i32 0) #0    %tmp17 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp15, align 16 -  %tmp18 = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %tmp13, <8 x i32> %tmp17, i32 15, i1 false, i1 false, i1 false, i1 false) #0 +  %tmp18 = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %tmp13.0, i32 %tmp13.1, <8 x i32> %tmp17, i32 0, i32 0) #0    %tmp19 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(4)* %tmp8, i64 0, i64 64    %tmp20 = bitcast i8 addrspace(4)* %tmp19 to <8 x i32> addrspace(4)*, !amdgpu.uniform !0    %tmp21 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp20, align 16 -  call void 
@llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float> %tmp18, <2 x i32> %tmp13, <8 x i32> %tmp21, i32 15, i1 false, i1 false, i1 false, i1 false) #0 +  call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %tmp18, i32 15, i32 %tmp13.0, i32 %tmp13.1, <8 x i32> %tmp21, i32 0, i32 0) #0    ret void  } @@ -43,10 +44,10 @@ define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg, i32 inreg %arg1  declare i64 @llvm.amdgcn.s.getpc() #1  ; Function Attrs: nounwind readonly -declare <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #2  ; Function Attrs: nounwind writeonly -declare void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float>, <2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #3 +declare void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #3  attributes #0 = { nounwind }  attributes #1 = { nounwind readnone speculatable } diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index ea3d466b9bb..59cf494d20a 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -75,10 +75,9 @@ define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out,  ; GCN-LABEL: {{^}}insertelement_to_sgpr:  ; GCN-NOT: v_readfirstlane -define amdgpu_ps <4 x float> @insertelement_to_sgpr() nounwind { -  %tmp = load <4 x i32>, <4 x i32> addrspace(2)* undef -  %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0 -  %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 false, i1 false, i1 false, i1 false, i1 true) +define amdgpu_ps <4 x float> @insertelement_to_sgpr(<4 x i32> inreg %samp) nounwind { +  %tmp1 = insertelement <4 x i32> %samp, i32 0, i32 0 +  %tmp2 = call <4 x float> 
@llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0)    ret <4 x float> %tmp2  } @@ -474,7 +473,7 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)    ret void  } -declare <4 x float> @llvm.amdgcn.image.gather4.lz.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1  attributes #0 = { nounwind }  attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll index bc0356a8a3a..efac4702578 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll @@ -1,3 +1,4 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s  ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s  ; GCN-LABEL: {{^}}atomic_swap_1d: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll index 671a5a6f05a..bf93ffa937a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck 
-check-prefixes=GCN %s  ; GCN-LABEL: {{^}}load_1d:  ; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm{{$}} @@ -370,6 +370,46 @@ main_body:    ret void  } +; GCN-LABEL: {{^}}getresinfo_dmask0: +; GCN-NOT: image +; GCN: ; return to shader part epilog +define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %mip) #0 { +main_body: +  %r = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32 0, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) +  ret <4 x float> %r +} + +; Ideally, the register allocator would avoid the wait here +; +; GCN-LABEL: {{^}}image_store_wait: +; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm +; SI: s_waitcnt expcnt(0) +; GCN: image_load v[0:3], v4, s[8:15] dmask:0xf unorm +; GCN: s_waitcnt vmcnt(0) +; GCN: image_store v[0:3], v4, s[16:23] dmask:0xf unorm +define amdgpu_ps void @image_store_wait(<8 x i32> inreg %arg, <8 x i32> inreg %arg1, <8 x i32> inreg %arg2, <4 x float> %arg3, i32 %arg4) #0 { +main_body: +  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %arg3, i32 15, i32 %arg4, <8 x i32> %arg, i32 0, i32 0) +  %data = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %arg4, <8 x i32> %arg1, i32 0, i32 0) +  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %data, i32 15, i32 %arg4, <8 x i32> %arg2, i32 0, i32 0) +  ret void +} + +; SI won't merge ds memory operations, because of the signed offset bug, so +; we only have check lines for VI. 
+; VI-LABEL: image_load_mmo +; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 +; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4 +define amdgpu_ps float @image_load_mmo(<8 x i32> inreg %rsrc, float addrspace(3)* %lds, <2 x i32> %c) #0 { +  store float 0.000000e+00, float addrspace(3)* %lds +  %c0 = extractelement <2 x i32> %c, i32 0 +  %c1 = extractelement <2 x i32> %c, i32 1 +  %tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 15, i32 %c0, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0) +  %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4 +  store float 0.000000e+00, float addrspace(3)* %tmp2 +  ret float %tex +} +  declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #1  declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1  declare <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 @@ -412,6 +452,7 @@ declare <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i32(i32, i32, <8  declare <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #2  declare float @llvm.amdgcn.image.load.1d.f32.i32(i32, i32, <8 x i32>, i32, i32) #1 +declare float @llvm.amdgcn.image.load.2d.f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1  declare <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32, i32, <8 x i32>, i32, i32) #1  declare void @llvm.amdgcn.image.store.1d.f32.i32(float, i32, i32, <8 x i32>, i32, i32) #0  declare void @llvm.amdgcn.image.store.1d.v2f32.i32(<2 x float>, i32, i32, <8 x i32>, i32, i32) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.dim.ll index f7fe050feb9..59834320ac4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.dim.ll @@ -1,3 +1,4 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN 
%s  ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s  ; GCN-LABEL: {{^}}gather4_2d: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.o.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.o.dim.ll new file mode 100644 index 00000000000..712c2991746 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.o.dim.ll @@ -0,0 +1,118 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s + +; GCN-LABEL: {{^}}gather4_o_2d: +; GCN: image_gather4_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps <4 x float> @gather4_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_c_o_2d: +; GCN: image_gather4_c_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps <4 x float> @gather4_c_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_cl_o_2d: +; GCN: image_gather4_cl_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps <4 x float> @gather4_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t, float %clamp) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; 
GCN-LABEL: {{^}}gather4_c_cl_o_2d: +; GCN: image_gather4_c_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps <4 x float> @gather4_c_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t, float %clamp) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_b_o_2d: +; GCN: image_gather4_b_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps <4 x float> @gather4_b_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %bias, float %s, float %t) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_c_b_o_2d: +; GCN: image_gather4_c_b_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps <4 x float> @gather4_c_b_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %bias, float %zcompare, float %s, float %t) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_b_cl_o_2d: +; GCN: image_gather4_b_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps <4 x float> @gather4_b_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %bias, float %s, float %t, float %clamp) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; 
GCN-LABEL: {{^}}gather4_c_b_cl_o_2d: +; GCN: image_gather4_c_b_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps <4 x float> @gather4_c_b_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %bias, float %zcompare, float %s, float %t, float %clamp) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_l_o_2d: +; GCN: image_gather4_l_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps <4 x float> @gather4_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t, float %lod) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_c_l_o_2d: +; GCN: image_gather4_c_l_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps <4 x float> @gather4_c_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t, float %lod) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_lz_o_2d: +; GCN: image_gather4_lz_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps <4 x float> @gather4_lz_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_c_lz_o_2d: +; GCN: 
image_gather4_c_lz_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps <4 x float> @gather4_c_lz_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +declare <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32, i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.gather4.cl.o.2d.v4f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.2d.v4f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.2d.v4f32.f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.2d.v4f32.f32.f32(i32, i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32, i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 
x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll index b85b343a882..65f4b46d0ae 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -1,3 +1,4 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s  ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s  ; GCN-LABEL: {{^}}sample_1d: @@ -400,6 +401,95 @@ main_body:    ret <4 x float> %v  } +; GCN-LABEL: {{^}}adjust_writemask_sample_0: +; GCN: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1{{$}} +define amdgpu_ps float @adjust_writemask_sample_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { +main_body: +  %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  %elt0 = extractelement <4 x float> %r, i32 0 +  ret float %elt0 +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_01 +; GCN: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x3{{$}} +define amdgpu_ps <2 x float> @adjust_writemask_sample_01(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { +main_body: +  %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  %out = shufflevector <4 x float> %r, <4 x float> undef, <2 x i32> <i32 0, i32 1> +  ret <2 x float> %out +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_012 +; GCN: image_sample v[0:2], v0, s[0:7], s[8:11] dmask:0x7{{$}} +define amdgpu_ps <3 x float> @adjust_writemask_sample_012(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { +main_body: +  
%r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  %out = shufflevector <4 x float> %r, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> +  ret <3 x float> %out +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_12 +; GCN: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x6{{$}} +define amdgpu_ps <2 x float> @adjust_writemask_sample_12(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { +main_body: +  %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  %out = shufflevector <4 x float> %r, <4 x float> undef, <2 x i32> <i32 1, i32 2> +  ret <2 x float> %out +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_03 +; GCN: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x9{{$}} +define amdgpu_ps <2 x float> @adjust_writemask_sample_03(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { +main_body: +  %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  %out = shufflevector <4 x float> %r, <4 x float> undef, <2 x i32> <i32 0, i32 3> +  ret <2 x float> %out +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_13 +; GCN: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0xa{{$}} +define amdgpu_ps <2 x float> @adjust_writemask_sample_13(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { +main_body: +  %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  %out = shufflevector <4 x float> %r, <4 x float> undef, <2 x i32> <i32 1, i32 3> +  ret <2 x float> %out +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_123 +; GCN: image_sample v[0:2], v0, s[0:7], s[8:11] dmask:0xe{{$}} +define amdgpu_ps <3 x float> @adjust_writemask_sample_123(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { +main_body: +  %r = call <4 x float> 
@llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  %out = shufflevector <4 x float> %r, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3> +  ret <3 x float> %out +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_none_enabled +; GCN-NOT: image +define amdgpu_ps <4 x float> @adjust_writemask_sample_none_enabled(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { +main_body: +  %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 0, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %r +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_123_to_12 +; GCN: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x6{{$}} +define amdgpu_ps <2 x float> @adjust_writemask_sample_123_to_12(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { +main_body: +  %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 14, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  %out = shufflevector <4 x float> %r, <4 x float> undef, <2 x i32> <i32 0, i32 1> +  ret <2 x float> %out +} + +; GCN-LABEL: {{^}}adjust_writemask_sample_013_to_13 +; GCN: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0xa{{$}} +define amdgpu_ps <2 x float> @adjust_writemask_sample_013_to_13(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { +main_body: +  %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 11, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  %out = shufflevector <4 x float> %r, <4 x float> undef, <2 x i32> <i32 1, i32 2> +  ret <2 x float> %out +} +  declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1  declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1  declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.dim.ll new file mode 100644 index 00000000000..fc6ce8faa0c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.dim.ll @@ -0,0 +1,371 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s + +; GCN-LABEL: {{^}}sample_o_1d: +; GCN: image_sample_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_o_2d: +; GCN: image_sample_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.o.2d.v4f32.f32(i32 15, i32 %offset, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_o_1d: +; GCN: image_sample_c_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.c.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_o_2d: +; GCN: image_sample_c_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float 
%t) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.c.o.2d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_cl_o_1d: +; GCN: image_sample_cl_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_cl_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %clamp) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.cl.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_cl_o_2d: +; GCN: image_sample_cl_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t, float %clamp) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.cl.o.2d.v4f32.f32(i32 15, i32 %offset, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_cl_o_1d: +; GCN: image_sample_c_cl_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_cl_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %clamp) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_cl_o_2d: +; GCN: image_sample_c_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t, float %clamp) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.o.2d.v4f32.f32(i32 15, i32 
%offset, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_b_o_1d: +; GCN: image_sample_b_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_b_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %bias, float %s) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.b.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_b_o_2d: +; GCN: image_sample_b_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_b_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %bias, float %s, float %t) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.b.o.2d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_b_o_1d: +; GCN: image_sample_c_b_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_b_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %bias, float %zcompare, float %s) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_b_o_2d: +; GCN: image_sample_c_b_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_b_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %bias, float %zcompare, float %s, float %t) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.o.2d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, 
i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_b_cl_o_1d: +; GCN: image_sample_b_cl_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_b_cl_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %bias, float %s, float %clamp) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_b_cl_o_2d: +; GCN: image_sample_b_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_b_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %bias, float %s, float %t, float %clamp) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.o.2d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_b_cl_o_1d: +; GCN: image_sample_c_b_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_b_cl_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %bias, float %zcompare, float %s, float %clamp) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_b_cl_o_2d: +; GCN: image_sample_c_b_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_b_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %bias, float %zcompare, float %s, float %t, float %clamp) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.2d.v4f32.f32.f32(i32 15, i32 %offset, float %bias, float %zcompare, float %s, 
float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_d_o_1d: +; GCN: image_sample_d_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_d_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %dsdh, float %dsdv, float %s) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.d.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_d_o_2d: +; GCN: image_sample_d_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_d_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.d.o.2d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_d_o_1d: +; GCN: image_sample_c_d_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_d_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_d_o_2d: +; GCN: image_sample_c_d_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_d_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) { +main_body: +  %v = call <4 x float> 
@llvm.amdgcn.image.sample.c.d.o.2d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_d_cl_o_1d: +; GCN: image_sample_d_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_d_cl_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_d_cl_o_2d: +; GCN: image_sample_d_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_d_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.o.2d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_d_cl_o_1d: +; GCN: image_sample_c_d_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_d_cl_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_d_cl_o_2d: +; GCN: image_sample_c_d_cl_o v[0:3], v[0:15], s[0:7], s[8:11] 
dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_d_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.2d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_cd_o_1d: +; GCN: image_sample_cd_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_cd_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %dsdh, float %dsdv, float %s) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.cd.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_cd_o_2d: +; GCN: image_sample_cd_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_cd_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.cd.o.2d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_cd_o_1d: +; GCN: image_sample_c_cd_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_cd_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x 
i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_cd_o_2d: +; GCN: image_sample_c_cd_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_cd_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.o.2d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_cd_cl_o_1d: +; GCN: image_sample_cd_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_cd_cl_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_cd_cl_o_2d: +; GCN: image_sample_cd_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_cd_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.2d.v4f32.f32.f32(i32 15, i32 %offset, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_cd_cl_o_1d: +; GCN: image_sample_c_cd_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_cd_cl_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float 
%dsdh, float %dsdv, float %s, float %clamp) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.1d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_cd_cl_o_2d: +; GCN: image_sample_c_cd_cl_o v[0:3], v[0:15], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_cd_cl_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.2d.v4f32.f32.f32(i32 15, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_l_o_1d: +; GCN: image_sample_l_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_l_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %lod) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.l.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_l_o_2d: +; GCN: image_sample_l_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t, float %lod) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.l.o.2d.v4f32.f32(i32 15, i32 %offset, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_l_o_1d: +; GCN: image_sample_c_l_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_l_o_1d(<8 x i32> 
inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %lod) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_l_o_2d: +; GCN: image_sample_c_l_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t, float %lod) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.2d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_lz_o_1d: +; GCN: image_sample_lz_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_lz_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.lz.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_lz_o_2d: +; GCN: image_sample_lz_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_lz_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.lz.o.2d.v4f32.f32(i32 15, i32 %offset, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_lz_o_1d: +; GCN: image_sample_c_lz_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_lz_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s) { +main_body: +  %v = call <4 x float> 
@llvm.amdgcn.image.sample.c.lz.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_lz_o_2d: +; GCN: image_sample_c_lz_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_lz_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t) { +main_body: +  %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.2d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) +  ret <4 x float> %v +} + +declare <4 x float> @llvm.amdgcn.image.sample.o.1d.v4f32.f32(i32, i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.o.2d.v4f32.f32(i32, i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.o.1d.v4f32.f32(i32, i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.o.2d.v4f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cl.o.1d.v4f32.f32(i32, i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cl.o.2d.v4f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cl.o.1d.v4f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cl.o.2d.v4f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.sample.b.o.1d.v4f32.f32.f32(i32, i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.b.o.2d.v4f32.f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> 
@llvm.amdgcn.image.sample.c.b.o.1d.v4f32.f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.b.o.2d.v4f32.f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.b.cl.o.1d.v4f32.f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.b.cl.o.2d.v4f32.f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.1d.v4f32.f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.2d.v4f32.f32.f32(i32, i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.sample.d.o.1d.v4f32.f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.o.2d.v4f32.f32.f32(i32, i32, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.o.1d.v4f32.f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.o.2d.v4f32.f32.f32(i32, i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.cl.o.1d.v4f32.f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.d.cl.o.2d.v4f32.f32.f32(i32, i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.1d.v4f32.f32.f32(i32, i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> 
@llvm.amdgcn.image.sample.c.d.cl.o.2d.v4f32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.sample.cd.o.1d.v4f32.f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.o.2d.v4f32.f32.f32(i32, i32, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.o.1d.v4f32.f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.o.2d.v4f32.f32.f32(i32, i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.1d.v4f32.f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.2d.v4f32.f32.f32(i32, i32, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.1d.v4f32.f32.f32(i32, i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.2d.v4f32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.sample.l.o.1d.v4f32.f32(i32, i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.l.o.2d.v4f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.l.o.1d.v4f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.l.o.2d.v4f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> 
@llvm.amdgcn.image.sample.lz.o.1d.v4f32.f32(i32, i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.lz.o.2d.v4f32.f32(i32, i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.lz.o.1d.v4f32.f32(i32, i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.lz.o.2d.v4f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll index 3061bd91c9c..7aa1597e20c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll @@ -26,7 +26,7 @@ define amdgpu_ps float @test2() #0 {    %live = call i1 @llvm.amdgcn.ps.live()    %live.32 = zext i1 %live to i32    %live.32.bc = bitcast i32 %live.32 to float -  %t = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %live.32.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %live.32.bc, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)    %r = extractelement <4 x float> %t, i32 0    ret float %r  } @@ -49,13 +49,13 @@ dead:  end:    %tc = phi i32 [ %in, %entry ], [ %tc.dead, %dead ]    %tc.bc = bitcast i32 %tc to float -  %t = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %tc.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 +  %t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tc.bc, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0    %r = extractelement <4 x float> %t, i32 0    ret float %r  }  declare i1 @llvm.amdgcn.ps.live() #1 -declare <4 x float> 
@llvm.amdgcn.image.sample.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2  attributes #0 = { nounwind }  attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll index 066b4dab56c..dbe48067187 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll @@ -8,9 +8,9 @@  ; CHECK-NEXT: image_store  ; CHECK-NEXT: s_endpgm  define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <4 x float> %d0, <4 x float> %d1, i32 %c0, i32 %c1) { -  call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %d0, i32 %c0, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 1, i1 0) +  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %d0, i32 15, i32 %c0, <8 x i32> %rsrc, i32 0, i32 0)    call void @llvm.amdgcn.s.waitcnt(i32 3840) ; 0xf00 -  call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %d1, i32 %c1, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 1, i1 0) +  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %d1, i32 15, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0)    ret void  } @@ -24,17 +24,17 @@ define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <4 x float> %d0, <4 x float>  ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0){{$}}  ; CHECK-NEXT: image_store  define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, i32 %c) { -  %t = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) +  %t = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)    call void @llvm.amdgcn.s.waitcnt(i32 3840) ; 0xf00    %c.1 = mul i32 %c, 2 -  call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %t, i32 %c.1, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) +  call void 
@llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %t, i32 15, i32 %c.1, <8 x i32> %rsrc, i32 0, i32 0)    ret void  }  declare void @llvm.amdgcn.s.waitcnt(i32) #0 -declare <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1 -declare void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #1 +declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #0  attributes #0 = { nounwind }  attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/print-mir-custom-pseudo.ll b/llvm/test/CodeGen/AMDGPU/print-mir-custom-pseudo.ll index 8746d4e7120..a56ae204005 100644 --- a/llvm/test/CodeGen/AMDGPU/print-mir-custom-pseudo.ll +++ b/llvm/test/CodeGen/AMDGPU/print-mir-custom-pseudo.ll @@ -10,8 +10,8 @@ target triple = "amdgcn--amdpal"  define dllexport amdgpu_ps <2 x float> @_amdgpu_ps_main(i32 inreg, i32 inreg, i32 inreg, i32 inreg, <2 x float>, <2 x float>, <2 x float>, <3 x float>, <2 x float>, <2 x float>, <2 x float>, float, float, float, float, float, i32, i32, i32, i32) local_unnamed_addr {  .entry: -  %res = call <2 x float> @llvm.amdgcn.image.sample.l.v2f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 false, i1 false, i1 false, i1 false, i1 false) +  %res = call <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32 3, float undef, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)    ret <2 x float> %res  } -declare <2 x float> @llvm.amdgcn.image.sample.l.v2f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) +declare <2 x float> @llvm.amdgcn.image.sample.1d.v2f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll index 014318964bf..be4e9fbdb95 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll +++ 
b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll @@ -81,13 +81,8 @@ main_body:    %j.f.i4 = bitcast i32 %j.i2 to float    %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 2, i32 1, i32 %arg3) #1    %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 1, i32 %arg3) #1 -  %tmp45 = bitcast float %p2.i to i32 -  %tmp46 = bitcast float %p2.i24 to i32 -  %tmp47 = insertelement <2 x i32> undef, i32 %tmp45, i32 0 -  %tmp48 = insertelement <2 x i32> %tmp47, i32 %tmp46, i32 1    %tmp39.bc = bitcast <4 x i32> %tmp39 to <4 x i32> -  %a.bc.i = bitcast <2 x i32> %tmp48 to <2 x float> -  %tmp1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i, <8 x i32> %tmp37, <4 x i32> %tmp39.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp1 = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %p2.i, float %p2.i24, <8 x i32> %tmp37, <4 x i32> %tmp39.bc, i1 0, i32 0, i32 0)    %tmp50 = extractelement <4 x float> %tmp1, i32 2    %tmp51 = call float @llvm.fabs.f32(float %tmp50)    %tmp52 = fmul float %p2.i18, %p2.i18 @@ -240,14 +235,14 @@ entry:    br i1 %tmp27, label %if, label %else  if:                                               ; preds = %entry -  %tmp1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 0x36D6000000000000, float 0x36DA000000000000>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp1 = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0x36D6000000000000, float 0x36DA000000000000, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i1 0, i32 0, i32 0)    %val.if.0 = extractelement <4 x float> %tmp1, i32 0    %val.if.1 = extractelement <4 x float> %tmp1, i32 1    %val.if.2 = extractelement <4 x float> %tmp1, i32 2    br label %endif  else:                                             ; preds = %entry -  %tmp2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x 
float> <float 0x36C4000000000000, float 0x36CC000000000000>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp2 = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0x36C4000000000000, float 0x36CC000000000000, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i1 0, i32 0, i32 0)    %val.else.0 = extractelement <4 x float> %tmp2, i32 0    %val.else.1 = extractelement <4 x float> %tmp2, i32 1    %val.else.2 = extractelement <4 x float> %tmp2, i32 2 @@ -352,24 +347,18 @@ bb:    br i1 %tmp36, label %bb38, label %bb80  bb38:                                             ; preds = %bb -  %tmp52 = bitcast float %p2.i to i32 -  %tmp53 = bitcast float %p2.i6 to i32 -  %tmp54 = insertelement <2 x i32> undef, i32 %tmp52, i32 0 -  %tmp55 = insertelement <2 x i32> %tmp54, i32 %tmp53, i32 1    %tmp56 = bitcast <8 x i32> %tmp26 to <8 x i32> -  %a.bc.i = bitcast <2 x i32> %tmp55 to <2 x float> -  %tmp2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i, <8 x i32> %tmp56, <4 x i32> %tmp28, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp2 = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %p2.i, float %p2.i6, <8 x i32> %tmp56, <4 x i32> %tmp28, i1 0, i32 0, i32 0)    br label %bb71  bb80:                                             ; preds = %bb    %tmp81 = bitcast float %p2.i to i32    %tmp82 = bitcast float %p2.i6 to i32    %tmp82.2 = add i32 %tmp82, 1 -  %tmp83 = insertelement <2 x i32> undef, i32 %tmp81, i32 0 -  %tmp84 = insertelement <2 x i32> %tmp83, i32 %tmp82.2, i32 1 +  %tmp83 = bitcast i32 %tmp81 to float +  %tmp84 = bitcast i32 %tmp82.2 to float    %tmp85 = bitcast <8 x i32> %tmp26 to <8 x i32> -  %a.bc.i1 = bitcast <2 x i32> %tmp84 to <2 x float> -  %tmp3 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i1, <8 x i32> %tmp85, <4 x i32> %tmp28, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp3 
= call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %tmp83, float %tmp84, <8 x i32> %tmp85, <4 x i32> %tmp28, i1 0, i32 0, i32 0)    br label %bb71  bb71:                                             ; preds = %bb80, %bb38 @@ -387,7 +376,7 @@ bb:    %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0    %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(4)* %arg, i32 0, i32 %tid    %tmp8 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp7, align 32, !tbaa !0 -  %tmp = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 7.500000e-01, float 2.500000e-01>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 7.500000e-01, float 2.500000e-01, <8 x i32> %tmp8, <4 x i32> undef, i1 0, i32 0, i32 0)    %tmp10 = extractelement <4 x float> %tmp, i32 0    %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %tmp10)    call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp12, <2 x half> undef, i1 true, i1 true) #0 @@ -402,7 +391,7 @@ bb:    %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0    %tmp7 = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i32 0, i32 %tid    %tmp8 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp7, align 16, !tbaa !0 -  %tmp = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 7.500000e-01, float 2.500000e-01>, <8 x i32> undef, <4 x i32> %tmp8, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 7.500000e-01, float 2.500000e-01, <8 x i32> undef, <4 x i32> %tmp8, i1 0, i32 0, i32 0)    %tmp10 = extractelement <4 x float> %tmp, i32 0    %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp10, float undef)    call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp12, <2 x half> undef, i1 true, 
i1 true) #0 @@ -419,7 +408,7 @@ declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1  declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0  declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0  declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 -declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2  declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1  attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/si-scheduler.ll b/llvm/test/CodeGen/AMDGPU/si-scheduler.ll index 540c8283c94..eca97c40b19 100644 --- a/llvm/test/CodeGen/AMDGPU/si-scheduler.ll +++ b/llvm/test/CodeGen/AMDGPU/si-scheduler.ll @@ -34,14 +34,9 @@ main_body:    %j.f.i4 = bitcast i32 %j.i2 to float    %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 1, i32 0, i32 %arg5) #1    %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 1, i32 0, i32 %arg5) #1 -  %tmp27 = bitcast float %p2.i to i32 -  %tmp28 = bitcast float %p2.i6 to i32 -  %tmp29 = insertelement <2 x i32> undef, i32 %tmp27, i32 0 -  %tmp30 = insertelement <2 x i32> %tmp29, i32 %tmp28, i32 1    %tmp22.bc = bitcast <32 x i8> %tmp22 to <8 x i32>    %tmp24.bc = bitcast <16 x i8> %tmp24 to <4 x i32> -  %tmp30.bc = bitcast <2 x i32> %tmp30 to <2 x float> -  %tmp31 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp30.bc, <8 x i32> %tmp22.bc, <4 x i32> %tmp24.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp31 = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %p2.i, float %p2.i6, <8 x i32> %tmp22.bc, <4 x i32> %tmp24.bc, i1 0, i32 0, i32 0)    %tmp32 = extractelement <4 x float> %tmp31, i32 0    %tmp33 = extractelement <4 x float> %tmp31, i32 1 @@ -57,7 
+52,7 @@ declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1  declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1  declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0  declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 -declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2  attributes #0 = { nounwind }  attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll index f3f2611855a..683c6695322 100644 --- a/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll @@ -324,14 +324,6 @@ main_body:    %tmp213 = fmul float %tmp205, %tmp191    %tmp214 = fmul float %tmp206, %tmp191    %tmp215 = fmul float -1.000000e+00, %tmp191 -  %tmp216 = bitcast float %tmp135 to i32 -  %tmp217 = bitcast float %tmp181 to i32 -  %tmp218 = bitcast float %tmp136 to i32 -  %tmp219 = bitcast float %tmp182 to i32 -  %tmp220 = insertelement <8 x i32> undef, i32 %tmp216, i32 0 -  %tmp221 = insertelement <8 x i32> %tmp220, i32 %tmp217, i32 1 -  %tmp222 = insertelement <8 x i32> %tmp221, i32 %tmp218, i32 2 -  %tmp223 = insertelement <8 x i32> %tmp222, i32 %tmp219, i32 3    br label %LOOP  LOOP:                                             ; preds = %ENDIF, %main_body @@ -358,14 +350,7 @@ IF:                                               ; preds = %LOOP    br label %LOOP65  ENDIF:                                            ; preds = %LOOP -  %tmp237 = bitcast float %temp28.0 to i32 -  %tmp238 = bitcast float %temp29.0 to i32 -  %tmp239 = insertelement <8 x i32> %tmp223, i32 %tmp237, i32 4 -  %tmp240 = insertelement <8 x i32> %tmp239, i32 %tmp238, i32 5 -  %tmp241 = insertelement <8 x i32> %tmp240, i32 undef, i32 6 -  %tmp242 = 
insertelement <8 x i32> %tmp241, i32 undef, i32 7 -  %tmp242.bc = bitcast <8 x i32> %tmp242 to <8 x float> -  %tmp243 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp242.bc, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp243 = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float %tmp135, float %tmp181, float %tmp136, float %tmp182, float %temp28.0, float %temp29.0, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i1 0, i32 0, i32 0)    %tmp244 = extractelement <4 x float> %tmp243, i32 3    %tmp245 = fcmp oge float %temp30.0, %tmp244    %tmp246 = sext i1 %tmp245 to i32 @@ -396,65 +381,20 @@ LOOP65:                                           ; preds = %ENDIF66, %IF    br i1 %tmp262, label %IF67, label %ENDIF66  IF67:                                             ; preds = %LOOP65 -  %tmp263 = bitcast float %tmp135 to i32 -  %tmp264 = bitcast float %tmp181 to i32 -  %tmp265 = bitcast float %tmp136 to i32 -  %tmp266 = bitcast float %tmp182 to i32 -  %tmp267 = bitcast float %temp28.1 to i32 -  %tmp268 = bitcast float %temp29.1 to i32 -  %tmp269 = insertelement <8 x i32> undef, i32 %tmp263, i32 0 -  %tmp270 = insertelement <8 x i32> %tmp269, i32 %tmp264, i32 1 -  %tmp271 = insertelement <8 x i32> %tmp270, i32 %tmp265, i32 2 -  %tmp272 = insertelement <8 x i32> %tmp271, i32 %tmp266, i32 3 -  %tmp273 = insertelement <8 x i32> %tmp272, i32 %tmp267, i32 4 -  %tmp274 = insertelement <8 x i32> %tmp273, i32 %tmp268, i32 5 -  %tmp275 = insertelement <8 x i32> %tmp274, i32 undef, i32 6 -  %tmp276 = insertelement <8 x i32> %tmp275, i32 undef, i32 7    %tmp67.bc = bitcast <4 x i32> %tmp67 to <4 x i32> -  %tmp276.bc = bitcast <8 x i32> %tmp276 to <8 x float> -  %tmp277 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp276.bc, <8 x i32> %tmp65, <4 x i32> %tmp67.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp277 = call <4 x float> 
@llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float %tmp135, float %tmp181, float %tmp136, float %tmp182, float %temp28.1, float %temp29.1, <8 x i32> %tmp65, <4 x i32> %tmp67.bc, i1 0, i32 0, i32 0)    %tmp278 = extractelement <4 x float> %tmp277, i32 0    %tmp279 = extractelement <4 x float> %tmp277, i32 1    %tmp280 = extractelement <4 x float> %tmp277, i32 2    %tmp281 = extractelement <4 x float> %tmp277, i32 3    %tmp282 = fmul float %tmp281, %tmp46 -  %tmp283 = bitcast float %tmp135 to i32 -  %tmp284 = bitcast float %tmp181 to i32 -  %tmp285 = bitcast float %tmp136 to i32 -  %tmp286 = bitcast float %tmp182 to i32 -  %tmp287 = bitcast float %temp28.1 to i32 -  %tmp288 = bitcast float %temp29.1 to i32 -  %tmp289 = insertelement <8 x i32> undef, i32 %tmp283, i32 0 -  %tmp290 = insertelement <8 x i32> %tmp289, i32 %tmp284, i32 1 -  %tmp291 = insertelement <8 x i32> %tmp290, i32 %tmp285, i32 2 -  %tmp292 = insertelement <8 x i32> %tmp291, i32 %tmp286, i32 3 -  %tmp293 = insertelement <8 x i32> %tmp292, i32 %tmp287, i32 4 -  %tmp294 = insertelement <8 x i32> %tmp293, i32 %tmp288, i32 5 -  %tmp295 = insertelement <8 x i32> %tmp294, i32 undef, i32 6 -  %tmp296 = insertelement <8 x i32> %tmp295, i32 undef, i32 7    %tmp83.bc = bitcast <4 x i32> %tmp83 to <4 x i32> -  %tmp296.bc = bitcast <8 x i32> %tmp296 to <8 x float> -  %tmp297 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp296.bc, <8 x i32> %tmp81, <4 x i32> %tmp83.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp297 = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float %tmp135, float %tmp181, float %tmp136, float %tmp182, float %temp28.1, float %temp29.1, <8 x i32> %tmp81, <4 x i32> %tmp83.bc, i1 0, i32 0, i32 0)    %tmp298 = extractelement <4 x float> %tmp297, i32 0    %tmp299 = extractelement <4 x float> %tmp297, i32 1    %tmp300 = extractelement <4 x float> %tmp297, i32 2 -  %tmp301 = bitcast float %tmp135 to i32 -  %tmp302 = 
bitcast float %tmp181 to i32 -  %tmp303 = bitcast float %tmp136 to i32 -  %tmp304 = bitcast float %tmp182 to i32 -  %tmp305 = bitcast float %temp28.1 to i32 -  %tmp306 = bitcast float %temp29.1 to i32 -  %tmp307 = insertelement <8 x i32> undef, i32 %tmp301, i32 0 -  %tmp308 = insertelement <8 x i32> %tmp307, i32 %tmp302, i32 1 -  %tmp309 = insertelement <8 x i32> %tmp308, i32 %tmp303, i32 2 -  %tmp310 = insertelement <8 x i32> %tmp309, i32 %tmp304, i32 3 -  %tmp311 = insertelement <8 x i32> %tmp310, i32 %tmp305, i32 4 -  %tmp312 = insertelement <8 x i32> %tmp311, i32 %tmp306, i32 5 -  %tmp313 = insertelement <8 x i32> %tmp312, i32 undef, i32 6 -  %tmp314 = insertelement <8 x i32> %tmp313, i32 undef, i32 7    %tmp79.bc = bitcast <4 x i32> %tmp79 to <4 x i32> -  %tmp314.bc = bitcast <8 x i32> %tmp314 to <8 x float> -  %tmp315 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp314.bc, <8 x i32> %tmp77, <4 x i32> %tmp79.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp315 = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float %tmp135, float %tmp181, float %tmp136, float %tmp182, float %temp28.1, float %temp29.1, <8 x i32> %tmp77, <4 x i32> %tmp79.bc, i1 0, i32 0, i32 0)    %tmp316 = extractelement <4 x float> %tmp315, i32 0    %tmp317 = extractelement <4 x float> %tmp315, i32 1    %tmp318 = extractelement <4 x float> %tmp315, i32 2 @@ -470,22 +410,7 @@ IF67:                                             ; preds = %LOOP65    %tmp328 = fadd float %tmp278, %tmp323    %tmp329 = fadd float %tmp279, %tmp325    %tmp330 = fadd float %tmp280, %tmp327 -  %tmp331 = bitcast float %tmp135 to i32 -  %tmp332 = bitcast float %tmp181 to i32 -  %tmp333 = bitcast float %tmp136 to i32 -  %tmp334 = bitcast float %tmp182 to i32 -  %tmp335 = bitcast float %temp28.1 to i32 -  %tmp336 = bitcast float %temp29.1 to i32 -  %tmp337 = insertelement <8 x i32> undef, i32 %tmp331, i32 0 -  %tmp338 = insertelement <8 x i32> %tmp337, 
i32 %tmp332, i32 1 -  %tmp339 = insertelement <8 x i32> %tmp338, i32 %tmp333, i32 2 -  %tmp340 = insertelement <8 x i32> %tmp339, i32 %tmp334, i32 3 -  %tmp341 = insertelement <8 x i32> %tmp340, i32 %tmp335, i32 4 -  %tmp342 = insertelement <8 x i32> %tmp341, i32 %tmp336, i32 5 -  %tmp343 = insertelement <8 x i32> %tmp342, i32 undef, i32 6 -  %tmp344 = insertelement <8 x i32> %tmp343, i32 undef, i32 7 -  %tmp344.bc = bitcast <8 x i32> %tmp344 to <8 x float> -  %tmp345 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp344.bc, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp345 = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float %tmp135, float %tmp181, float %tmp136, float %tmp182, float %temp28.1, float %temp29.1, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i1 0, i32 0, i32 0)    %tmp346 = extractelement <4 x float> %tmp345, i32 0    %tmp347 = extractelement <4 x float> %tmp345, i32 1    %tmp348 = extractelement <4 x float> %tmp345, i32 2 @@ -501,23 +426,8 @@ IF67:                                             ; preds = %LOOP65    %tmp358 = fmul float %tmp349, %tmp357    %tmp359 = fmul float %tmp350, %tmp357    %tmp360 = fmul float %tmp351, %tmp357 -  %tmp361 = bitcast float %tmp135 to i32 -  %tmp362 = bitcast float %tmp181 to i32 -  %tmp363 = bitcast float %tmp136 to i32 -  %tmp364 = bitcast float %tmp182 to i32 -  %tmp365 = bitcast float %temp28.1 to i32 -  %tmp366 = bitcast float %temp29.1 to i32 -  %tmp367 = insertelement <8 x i32> undef, i32 %tmp361, i32 0 -  %tmp368 = insertelement <8 x i32> %tmp367, i32 %tmp362, i32 1 -  %tmp369 = insertelement <8 x i32> %tmp368, i32 %tmp363, i32 2 -  %tmp370 = insertelement <8 x i32> %tmp369, i32 %tmp364, i32 3 -  %tmp371 = insertelement <8 x i32> %tmp370, i32 %tmp365, i32 4 -  %tmp372 = insertelement <8 x i32> %tmp371, i32 %tmp366, i32 5 -  %tmp373 = insertelement <8 x i32> %tmp372, i32 undef, i32 6 -  %tmp374 = 
insertelement <8 x i32> %tmp373, i32 undef, i32 7    %tmp71.bc = bitcast <4 x i32> %tmp71 to <4 x i32> -  %tmp374.bc = bitcast <8 x i32> %tmp374 to <8 x float> -  %tmp375 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp374.bc, <8 x i32> %tmp69, <4 x i32> %tmp71.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp375 = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float %tmp135, float %tmp181, float %tmp136, float %tmp182, float %temp28.1, float %temp29.1, <8 x i32> %tmp69, <4 x i32> %tmp71.bc, i1 0, i32 0, i32 0)    %tmp376 = extractelement <4 x float> %tmp375, i32 0    %tmp377 = extractelement <4 x float> %tmp375, i32 1    %tmp378 = extractelement <4 x float> %tmp375, i32 2 @@ -557,23 +467,8 @@ IF67:                                             ; preds = %LOOP65    %tmp412 = fadd float %tmp411, %tmp406    %tmp413 = fmul float %tmp399, %p2.i24    %tmp414 = fadd float %tmp413, %tmp408 -  %tmp415 = bitcast float %tmp135 to i32 -  %tmp416 = bitcast float %tmp181 to i32 -  %tmp417 = bitcast float %tmp136 to i32 -  %tmp418 = bitcast float %tmp182 to i32 -  %tmp419 = bitcast float %temp28.1 to i32 -  %tmp420 = bitcast float %temp29.1 to i32 -  %tmp421 = insertelement <8 x i32> undef, i32 %tmp415, i32 0 -  %tmp422 = insertelement <8 x i32> %tmp421, i32 %tmp416, i32 1 -  %tmp423 = insertelement <8 x i32> %tmp422, i32 %tmp417, i32 2 -  %tmp424 = insertelement <8 x i32> %tmp423, i32 %tmp418, i32 3 -  %tmp425 = insertelement <8 x i32> %tmp424, i32 %tmp419, i32 4 -  %tmp426 = insertelement <8 x i32> %tmp425, i32 %tmp420, i32 5 -  %tmp427 = insertelement <8 x i32> %tmp426, i32 undef, i32 6 -  %tmp428 = insertelement <8 x i32> %tmp427, i32 undef, i32 7    %tmp87.bc = bitcast <4 x i32> %tmp87 to <4 x i32> -  %tmp428.bc = bitcast <8 x i32> %tmp428 to <8 x float> -  %tmp429 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp428.bc, <8 x i32> %tmp85, <4 x i32> %tmp87.bc, i32 15, i1 
false, i1 false, i1 false, i1 false, i1 false) +  %tmp429 = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float %tmp135, float %tmp181, float %tmp136, float %tmp182, float %temp28.1, float %temp29.1, <8 x i32> %tmp85, <4 x i32> %tmp87.bc, i1 0, i32 0, i32 0)    %tmp430 = extractelement <4 x float> %tmp429, i32 0    %tmp431 = extractelement <4 x float> %tmp429, i32 1    %tmp432 = extractelement <4 x float> %tmp429, i32 2 @@ -617,16 +512,8 @@ IF67:                                             ; preds = %LOOP65    %tmp460 = fadd float %tmp459, 1.500000e+00    %tmp461 = fmul float %tmp454, %tmp458    %tmp462 = fadd float %tmp461, 1.500000e+00 -  %tmp463 = bitcast float %tmp462 to i32 -  %tmp464 = bitcast float %tmp460 to i32 -  %tmp465 = bitcast float %tmp456 to i32 -  %tmp466 = insertelement <4 x i32> undef, i32 %tmp463, i32 0 -  %tmp467 = insertelement <4 x i32> %tmp466, i32 %tmp464, i32 1 -  %tmp468 = insertelement <4 x i32> %tmp467, i32 %tmp465, i32 2 -  %tmp469 = insertelement <4 x i32> %tmp468, i32 undef, i32 3    %tmp91.bc = bitcast <4 x i32> %tmp91 to <4 x i32> -  %tmp469.bc = bitcast <4 x i32> %tmp469 to <4 x float> -  %tmp470 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tmp469.bc, <8 x i32> %tmp89, <4 x i32> %tmp91.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 +  %tmp470 = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32 15, float %tmp462, float %tmp460, float %tmp456, <8 x i32> %tmp89, <4 x i32> %tmp91.bc, i1 0, i32 0, i32 0) #0    %tmp471 = extractelement <4 x float> %tmp470, i32 0    %tmp472 = extractelement <4 x float> %tmp470, i32 1    %tmp473 = extractelement <4 x float> %tmp470, i32 2 @@ -713,23 +600,8 @@ IF67:                                             ; preds = %LOOP65    %tmp554 = fadd float %tmp553, %tmp549    %tmp555 = fmul float %tmp547, %tmp58    %tmp556 = fadd float %tmp555, %tmp550 -  %tmp557 = bitcast float %tmp135 to i32 -  %tmp558 = bitcast float %tmp181 
to i32 -  %tmp559 = bitcast float %tmp136 to i32 -  %tmp560 = bitcast float %tmp182 to i32 -  %tmp561 = bitcast float %temp28.1 to i32 -  %tmp562 = bitcast float %temp29.1 to i32 -  %tmp563 = insertelement <8 x i32> undef, i32 %tmp557, i32 0 -  %tmp564 = insertelement <8 x i32> %tmp563, i32 %tmp558, i32 1 -  %tmp565 = insertelement <8 x i32> %tmp564, i32 %tmp559, i32 2 -  %tmp566 = insertelement <8 x i32> %tmp565, i32 %tmp560, i32 3 -  %tmp567 = insertelement <8 x i32> %tmp566, i32 %tmp561, i32 4 -  %tmp568 = insertelement <8 x i32> %tmp567, i32 %tmp562, i32 5 -  %tmp569 = insertelement <8 x i32> %tmp568, i32 undef, i32 6 -  %tmp570 = insertelement <8 x i32> %tmp569, i32 undef, i32 7    %tmp75.bc = bitcast <4 x i32> %tmp75 to <4 x i32> -  %tmp570.bc = bitcast <8 x i32> %tmp570 to <8 x float> -  %tmp571 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp570.bc, <8 x i32> %tmp73, <4 x i32> %tmp75.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp571 = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float %tmp135, float %tmp181, float %tmp136, float %tmp182, float %temp28.1, float %temp29.1, <8 x i32> %tmp73, <4 x i32> %tmp75.bc, i1 0, i32 0, i32 0)    %tmp572 = extractelement <4 x float> %tmp571, i32 0    %tmp573 = extractelement <4 x float> %tmp571, i32 1    %tmp574 = extractelement <4 x float> %tmp571, i32 2 @@ -745,14 +617,7 @@ IF67:                                             ; preds = %LOOP65    ret void  ENDIF66:                                          ; preds = %LOOP65 -  %tmp585 = bitcast float %temp28.1 to i32 -  %tmp586 = bitcast float %temp29.1 to i32 -  %tmp587 = insertelement <8 x i32> %tmp236, i32 %tmp585, i32 4 -  %tmp588 = insertelement <8 x i32> %tmp587, i32 %tmp586, i32 5 -  %tmp589 = insertelement <8 x i32> %tmp588, i32 undef, i32 6 -  %tmp590 = insertelement <8 x i32> %tmp589, i32 undef, i32 7 -  %tmp590.bc = bitcast <8 x i32> %tmp590 to <8 x float> -  %tmp591 = call <4 x 
float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp590.bc, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp591 = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float %tmp135, float %tmp181, float %tmp136, float %tmp182, float %temp28.1, float %temp29.1, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i1 0, i32 0, i32 0)    %tmp592 = extractelement <4 x float> %tmp591, i32 3    %tmp593 = fcmp oge float %temp30.1, %tmp592    %tmp594 = sext i1 %tmp593 to i32 @@ -1140,13 +1005,8 @@ main_body:    %tmp218 = fmul float %., %tmp53    %tmp219 = fmul float %arg13, %tmp46    %tmp220 = fmul float %tmp196, %tmp47 -  %tmp221 = bitcast float %p2.i132 to i32 -  %tmp222 = bitcast float %p2.i126 to i32 -  %tmp223 = insertelement <2 x i32> undef, i32 %tmp221, i32 0 -  %tmp224 = insertelement <2 x i32> %tmp223, i32 %tmp222, i32 1    %tmp132.bc = bitcast <4 x i32> %tmp132 to <4 x i32> -  %tmp224.bc = bitcast <2 x i32> %tmp224 to <2 x float> -  %tmp225 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp224.bc, <8 x i32> %tmp130, <4 x i32> %tmp132.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp225 = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %p2.i132, float %p2.i126, <8 x i32> %tmp130, <4 x i32> %tmp132.bc, i1 0, i32 0, i32 0)    %tmp226 = extractelement <4 x float> %tmp225, i32 0    %tmp227 = extractelement <4 x float> %tmp225, i32 1    %tmp228 = extractelement <4 x float> %tmp225, i32 2 @@ -1220,7 +1080,7 @@ LOOP:                                             ; preds = %LOOP, %main_body    %tmp281 = insertelement <4 x i32> %tmp280, i32 undef, i32 3    %tmp148.bc = bitcast <4 x i32> %tmp148 to <4 x i32>    %tmp281.bc = bitcast <4 x i32> %tmp281 to <4 x float> -  %tmp282 = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> %tmp281.bc, <8 x i32> %tmp146, <4 x i32> %tmp148.bc, i32 15, i1 false, i1 false, 
i1 false, i1 false, i1 false) +  %tmp282 = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float %temp168.0, float %temp169.0, float 0.0, <8 x i32> %tmp146, <4 x i32> %tmp148.bc, i1 0, i32 0, i32 0)    %tmp283 = extractelement <4 x float> %tmp282, i32 3    %tmp284 = fadd float %temp168.0, %tmp273    %tmp285 = fadd float %temp169.0, %tmp274 @@ -1279,13 +1139,8 @@ IF189:                                            ; preds = %LOOP    %tmp335 = fadd float %p2.i162, %tmp329    %tmp336 = fadd float %p2.i156, %tmp331    %tmp337 = fadd float %p2.i150, %tmp333 -  %tmp338 = bitcast float %tmp334 to i32 -  %tmp339 = bitcast float %tmp335 to i32 -  %tmp340 = insertelement <2 x i32> undef, i32 %tmp338, i32 0 -  %tmp341 = insertelement <2 x i32> %tmp340, i32 %tmp339, i32 1    %tmp136.bc = bitcast <4 x i32> %tmp136 to <4 x i32> -  %a.bc.i = bitcast <2 x i32> %tmp341 to <2 x float> -  %tmp0 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i, <8 x i32> %tmp134, <4 x i32> %tmp136.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp0 = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %tmp334, float %tmp335, <8 x i32> %tmp134, <4 x i32> %tmp136.bc, i1 0, i32 0, i32 0)    %tmp343 = extractelement <4 x float> %tmp0, i32 0    %tmp344 = extractelement <4 x float> %tmp0, i32 1    %tmp345 = extractelement <4 x float> %tmp0, i32 2 @@ -1313,25 +1168,15 @@ IF189:                                            ; preds = %LOOP    %one.sub.ac.i30 = fmul float %one.sub.a.i29, %tmp353    %mul.i31 = fmul float %tmp345, %tmp353    %result.i32 = fadd float %mul.i31, %one.sub.ac.i30 -  %tmp358 = bitcast float %tmp336 to i32 -  %tmp359 = bitcast float %tmp337 to i32 -  %tmp360 = insertelement <2 x i32> undef, i32 %tmp358, i32 0 -  %tmp361 = insertelement <2 x i32> %tmp360, i32 %tmp359, i32 1    %tmp152.bc = bitcast <4 x i32> %tmp152 to <4 x i32> -  %a.bc.i3 = bitcast <2 x i32> %tmp361 to <2 x float> -  %tmp1 = call <4 x 
float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i3, <8 x i32> %tmp150, <4 x i32> %tmp152.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp1 = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %tmp336, float %tmp337, <8 x i32> %tmp150, <4 x i32> %tmp152.bc, i1 0, i32 0, i32 0)    %tmp363 = extractelement <4 x float> %tmp1, i32 2    %tmp364 = fmul float %result.i40, %result.i    %tmp365 = fmul float %result.i36, %result.i44    %tmp366 = fmul float %result.i32, %result.i42    %tmp367 = fmul float %tmp354, %tmp229 -  %tmp368 = bitcast float %tmp310 to i32 -  %tmp369 = bitcast float %tmp311 to i32 -  %tmp370 = insertelement <2 x i32> undef, i32 %tmp368, i32 0 -  %tmp371 = insertelement <2 x i32> %tmp370, i32 %tmp369, i32 1    %tmp140.bc = bitcast <4 x i32> %tmp140 to <4 x i32> -  %a.bc.i2 = bitcast <2 x i32> %tmp371 to <2 x float> -  %tmp2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i2, <8 x i32> %tmp138, <4 x i32> %tmp140.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp2 = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %tmp310, float %tmp311, <8 x i32> %tmp138, <4 x i32> %tmp140.bc, i1 0, i32 0, i32 0)    %tmp373 = extractelement <4 x float> %tmp2, i32 0    %tmp374 = extractelement <4 x float> %tmp2, i32 1    %tmp375 = extractelement <4 x float> %tmp2, i32 2 @@ -1343,13 +1188,8 @@ IF189:                                            ; preds = %LOOP    %tmp381 = icmp ne i32 %tmp380, 0    %.224 = select i1 %tmp381, float %tmp374, float %tmp373    %.225 = select i1 %tmp381, float %tmp376, float %tmp374 -  %tmp382 = bitcast float %tmp320 to i32 -  %tmp383 = bitcast float %tmp321 to i32 -  %tmp384 = insertelement <2 x i32> undef, i32 %tmp382, i32 0 -  %tmp385 = insertelement <2 x i32> %tmp384, i32 %tmp383, i32 1    %tmp144.bc = bitcast <4 x i32> %tmp144 to <4 x i32> -  %a.bc.i1 = bitcast <2 x i32> %tmp385 to <2 x float> -  %tmp3 = call 
<4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i1, <8 x i32> %tmp142, <4 x i32> %tmp144.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp3 = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %tmp320, float %tmp321, <8 x i32> %tmp142, <4 x i32> %tmp144.bc, i1 0, i32 0, i32 0)    %tmp387 = extractelement <4 x float> %tmp3, i32 0    %tmp388 = extractelement <4 x float> %tmp3, i32 1    %tmp389 = extractelement <4 x float> %tmp3, i32 2 @@ -1442,13 +1282,8 @@ ENDIF197:                                         ; preds = %IF198, %IF189    %temp14.0 = phi float [ %tmp465, %IF198 ], [ %tmp457, %IF189 ]    %temp13.0 = phi float [ %tmp464, %IF198 ], [ %tmp456, %IF189 ]    %temp12.0 = phi float [ %tmp463, %IF198 ], [ %tmp455, %IF189 ] -  %tmp466 = bitcast float %tmp219 to i32 -  %tmp467 = bitcast float %tmp220 to i32 -  %tmp468 = insertelement <2 x i32> undef, i32 %tmp466, i32 0 -  %tmp469 = insertelement <2 x i32> %tmp468, i32 %tmp467, i32 1    %tmp160.bc = bitcast <4 x i32> %tmp160 to <4 x i32> -  %tmp469.bc = bitcast <2 x i32> %tmp469 to <2 x float> -  %tmp470 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp469.bc, <8 x i32> %tmp158, <4 x i32> %tmp160.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp470 = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %tmp219, float %tmp220, <8 x i32> %tmp158, <4 x i32> %tmp160.bc, i1 0, i32 0, i32 0)    %tmp471 = extractelement <4 x float> %tmp470, i32 0    %tmp472 = extractelement <4 x float> %tmp470, i32 1    %tmp473 = extractelement <4 x float> %tmp470, i32 2 @@ -1461,13 +1296,8 @@ ENDIF197:                                         ; preds = %IF198, %IF189    %tmp480 = fadd float %tmp479, %tmp40    %tmp481 = fmul float %tmp474, %tmp41    %tmp482 = fadd float %tmp481, %tmp42 -  %tmp483 = bitcast float %p2.i144 to i32 -  %tmp484 = bitcast float %p2.i138 to i32 -  %tmp485 = insertelement <2 x i32> undef, 
i32 %tmp483, i32 0 -  %tmp486 = insertelement <2 x i32> %tmp485, i32 %tmp484, i32 1    %tmp156.bc = bitcast <4 x i32> %tmp156 to <4 x i32> -  %tmp486.bc = bitcast <2 x i32> %tmp486 to <2 x float> -  %tmp487 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp486.bc, <8 x i32> %tmp154, <4 x i32> %tmp156.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp487 = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %p2.i144, float %p2.i138, <8 x i32> %tmp154, <4 x i32> %tmp156.bc, i1 0, i32 0, i32 0)    %tmp488 = extractelement <4 x float> %tmp487, i32 0    %tmp489 = extractelement <4 x float> %tmp487, i32 1    %tmp490 = extractelement <4 x float> %tmp487, i32 2 @@ -1667,27 +1497,11 @@ ENDIF209:                                         ; preds = %ELSE214, %ELSE211,    %tmp651 = fadd float %tmp650, 1.000000e+00    %max.0.i11 = call float @llvm.maxnum.f32(float %tmp651, float 0.000000e+00)    %clamp.i12 = call float @llvm.minnum.f32(float %max.0.i11, float 1.000000e+00) -  %tmp653 = bitcast float %tmp642 to i32 -  %tmp654 = bitcast float %tmp644 to i32 -  %tmp655 = bitcast float 0.000000e+00 to i32 -  %tmp656 = insertelement <4 x i32> undef, i32 %tmp653, i32 0 -  %tmp657 = insertelement <4 x i32> %tmp656, i32 %tmp654, i32 1 -  %tmp658 = insertelement <4 x i32> %tmp657, i32 %tmp655, i32 2 -  %tmp659 = insertelement <4 x i32> %tmp658, i32 undef, i32 3    %tmp128.bc = bitcast <4 x i32> %tmp128 to <4 x i32> -  %tmp659.bc = bitcast <4 x i32> %tmp659 to <4 x float> -  %tmp660 = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> %tmp659.bc, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp660 = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float %tmp642, float %tmp644, float 0.0, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i1 0, i32 0, i32 0)    %tmp661 = extractelement <4 x float> %tmp660, i32 0    %tmp662 = 
extractelement <4 x float> %tmp660, i32 1 -  %tmp663 = bitcast float %tmp646 to i32 -  %tmp664 = bitcast float %tmp648 to i32 -  %tmp665 = bitcast float 0.000000e+00 to i32 -  %tmp666 = insertelement <4 x i32> undef, i32 %tmp663, i32 0 -  %tmp667 = insertelement <4 x i32> %tmp666, i32 %tmp664, i32 1 -  %tmp668 = insertelement <4 x i32> %tmp667, i32 %tmp665, i32 2 -  %tmp669 = insertelement <4 x i32> %tmp668, i32 undef, i32 3 -  %tmp669.bc = bitcast <4 x i32> %tmp669 to <4 x float> -  %tmp670 = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> %tmp669.bc, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp670 = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float %tmp646, float %tmp648, float 0.0, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i1 0, i32 0, i32 0)    %tmp671 = extractelement <4 x float> %tmp670, i32 0    %tmp672 = extractelement <4 x float> %tmp670, i32 1    %tmp673 = fsub float -0.000000e+00, %tmp662 @@ -1865,10 +1679,10 @@ declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1  declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1  declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0  declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 -declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 -declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 -declare <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 -declare <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 +declare <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32, float, 
float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 +declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 +declare <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2  declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1  attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll index 7643ebed215..49c171e03de 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -355,7 +355,7 @@ bb7:                                              ; preds = %bb4  ; CHECK: [[END]]:  ; CHECK: s_endpgm -define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, <4 x float> %arg2) #0 { +define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, float %arg3) #0 {  bb:    %tmp = fcmp ult float %arg1, 0.000000e+00    br i1 %tmp, label %bb3, label %bb4 @@ -365,7 +365,7 @@ bb3:                                              ; preds = %bb    br label %bb4  bb4:                                              ; preds = %bb3, %bb -  %tmp5 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %arg2, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp5 = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 16, float %arg2, float %arg3, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)    %tmp6 = extractelement <4 x float> %tmp5, i32 0    %tmp7 = fcmp une float %tmp6, 0.000000e+00    br i1 %tmp7, label %bb8, label %bb9 @@ -378,7 +378,7 @@ bb9:                                              ; preds = %bb4    ret void  } -declare <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1 +declare <4 x float> 
@llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1  declare void @llvm.AMDGPU.kill(float) #0  attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/split-smrd.ll b/llvm/test/CodeGen/AMDGPU/split-smrd.ll index 6089492b1c2..46355b762b4 100644 --- a/llvm/test/CodeGen/AMDGPU/split-smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/split-smrd.ll @@ -21,7 +21,7 @@ bb3:                                              ; preds = %bb    %tmp6 = sext i32 %tmp5 to i64    %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(4)* %arg, i64 0, i64 %tmp6    %tmp8 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp7, align 32, !tbaa !0 -  %tmp9 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float bitcast (i32 1061158912 to float), float bitcast (i32 1048576000 to float)>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp9 = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float bitcast (i32 1061158912 to float), float bitcast (i32 1048576000 to float), <8 x i32> %tmp8, <4 x i32> undef, i1 0, i32 0, i32 0)    %tmp10 = extractelement <4 x float> %tmp9, i32 0    %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp10, float undef)    call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp12, <2 x half> undef, i1 true, i1 true) #0 @@ -30,7 +30,7 @@ bb3:                                              ; preds = %bb  declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1  declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 -declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2  declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1  attributes #0 = { nounwind } diff --git 
a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll index c2d04abf829..c8571b1381b 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll @@ -65,7 +65,7 @@ bb7:                                              ; preds = %bb6    br label %bb4  bb9:                                              ; preds = %bb2 -  %tmp10 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp10 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)    %tmp11 = extractelement <4 x float> %tmp10, i32 1    %tmp12 = extractelement <4 x float> %tmp10, i32 3    br label %bb14 @@ -97,7 +97,7 @@ bb27:                                             ; preds = %bb24  declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 -declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1  attributes #0 = { nounwind }  attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll index 1e08f51dabd..d4c05fb5682 100644 --- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll @@ -34,9 +34,8 @@ bb:    %tmp = load volatile i32, i32 addrspace(1)* undef, align 4    %tmp1 = load volatile i32, i32 addrspace(1)* undef, align 4    %tmp2 = insertelement <4 x i32> undef, i32 %tmp1, i32 0 -  %tmp3 = insertelement <4 x i32> %tmp2, i32 %tmp1, i32 1 -  %tmp3.cast = bitcast <4 x i32> %tmp3 to <4 x float> -  %tmp4 = call <4 x float> 
@llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tmp3.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp3 = bitcast i32 %tmp1 to float +  %tmp4 = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %tmp3, float %tmp3, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)    %tmp5 = extractelement <4 x float> %tmp4, i32 0    %tmp6 = fmul float %tmp5, undef    %tmp7 = fadd float %tmp6, %tmp6 @@ -84,7 +83,7 @@ define amdgpu_kernel void @partially_undef_copy() #0 {    ret void  } -declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1  attributes #0 = { nounwind }  attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/unigine-liveness-crash.ll b/llvm/test/CodeGen/AMDGPU/unigine-liveness-crash.ll index dcb75f1be26..603b5d42540 100644 --- a/llvm/test/CodeGen/AMDGPU/unigine-liveness-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/unigine-liveness-crash.ll @@ -17,25 +17,20 @@ main_body:    %j.f.i = bitcast i32 %j.i to float    %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 3, i32 4, i32 %arg6) #2    %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 3, i32 4, i32 %arg6) #2 -  %tmp23 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp23 = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float undef, float undef, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)    %tmp24 = extractelement <4 x float> %tmp23, i32 3    %tmp25 = fmul float %tmp24, %tmp24    %tmp26 = fmul float %p2.i, %p2.i    %tmp27 = fadd float %tmp26, %tmp26 -  %tmp28 = bitcast float %tmp27 to i32 -  %tmp29 = insertelement <4 x i32> undef, i32 
%tmp28, i32 0 -  %tmp30 = insertelement <4 x i32> %tmp29, i32 0, i32 1 -  %tmp31 = insertelement <4 x i32> %tmp30, i32 undef, i32 2 -  %tmp31.cast = bitcast <4 x i32> %tmp31 to <4 x float> -  %tmp32 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %tmp31.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp32 = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32 15, float %tmp27, float 0.0, float undef, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)    %tmp33 = extractelement <4 x float> %tmp32, i32 0    %tmp34 = fadd float %tmp33, %tmp33    %tmp35 = fadd float %tmp34, %tmp34    %tmp36 = fadd float %tmp35, %tmp35    %tmp37 = fadd float %tmp36, %tmp36    %tmp38 = fadd float %tmp37, %tmp37 -  %tmp39 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp39 = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float undef, float undef, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)    %tmp40 = extractelement <4 x float> %tmp39, i32 0    %tmp41 = extractelement <4 x float> %tmp39, i32 1    %tmp42 = extractelement <4 x float> %tmp39, i32 2 @@ -53,17 +48,12 @@ main_body:    %tmp54 = insertelement <4 x i32> %tmp53, i32 %tmp51, i32 1    %tmp55 = insertelement <4 x i32> %tmp54, i32 %tmp52, i32 2    %tmp55.cast = bitcast <4 x i32> %tmp55 to <4 x float> -  %tmp56 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %tmp55.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp56 = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32 15, float %tmp27, float %tmp48, float %tmp49, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)    %tmp57 = extractelement <4 x float> %tmp56, i32 0    %tmp58 = fadd float %tmp38, %tmp57    %tmp59 = fadd float %tmp46, %tmp46 
   %tmp60 = fadd float %tmp47, %tmp47 -  %tmp61 = bitcast float %tmp59 to i32 -  %tmp62 = bitcast float %tmp60 to i32 -  %tmp63 = insertelement <4 x i32> undef, i32 %tmp61, i32 1 -  %tmp64 = insertelement <4 x i32> %tmp63, i32 %tmp62, i32 2 -  %tmp64.cast = bitcast <4 x i32> %tmp64 to <4 x float> -  %tmp65 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %tmp64.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp65 = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32 15, float undef, float %tmp59, float %tmp60, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)    %tmp66 = extractelement <4 x float> %tmp65, i32 0    %tmp67 = fadd float %tmp58, %tmp66    %tmp68 = fmul float %tmp67, 1.250000e-01 @@ -101,10 +91,7 @@ IF29:                                             ; preds = %LOOP    br label %ENDIF25  ENDIF28:                                          ; preds = %LOOP -  %tmp85 = insertelement <4 x i32> %tmp72, i32 undef, i32 1 -  %tmp86 = insertelement <4 x i32> %tmp85, i32 undef, i32 2 -  %tmp86.cast = bitcast <4 x i32> %tmp86 to <4 x float> -  %tmp87 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %tmp86.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) +  %tmp87 = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32 15, float %tmp27, float undef, float undef, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0)    %tmp88 = extractelement <4 x float> %tmp87, i32 0    %tmp89 = fadd float %tmp88, %tmp88    br label %LOOP @@ -114,9 +101,8 @@ declare float @llvm.minnum.f32(float, float) #1  declare float @llvm.maxnum.f32(float, float) #1  declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1  declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 -declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) 
#2 -declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 -declare <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 +declare <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2  attributes #0 = { nounwind "InitialPSInputAddr"="36983" "target-cpu"="tonga" }  attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index cd3bfef1686..4c9a8d5a938 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -5,10 +5,10 @@  ;  ;CHECK-LABEL: {{^}}test1:  ;CHECK-NOT: s_wqm -define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, <4 x i32> %c) { +define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) {  main_body: -  %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) -  call void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float> %tex, <4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) +  %tex = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0) +  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %tex, i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)    ret <4 x float> %tex  } @@ -30,11 +30,9 @@ main_body:    %inst24 = extractelement <2 x float> %pos, i32 1    %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)    %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0) -  %inst27 = insertelement <2 x float> undef, float %inst26, i32 0    %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0) 
   %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0) -  %inst30 = insertelement <2 x float> %inst27, float %inst29, i32 1 -  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %inst30, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 +  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0    ret <4 x float> %tex  } @@ -49,9 +47,9 @@ main_body:  ;CHECK: store  ;CHECK-NOT: exec  ;CHECK: .size test3 -define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <4 x float> %c) { +define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {  main_body: -  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 +  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0    %tex.1 = bitcast <4 x float> %tex to <4 x i32>    %tex.2 = extractelement <4 x i32> %tex.1, i32 0 @@ -77,11 +75,9 @@ main_body:    %inst24 = extractelement <2 x float> %pos, i32 1    %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)    %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0) -  %inst27 = insertelement <2 x float> undef, float %inst26, i32 0    %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)    %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0) -  %inst30 = insertelement <2 x float> %inst27, float %inst29, i32 1 -  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %inst30, <8 x i32> %rsrc, 
<4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 +  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0    %tex.0 = extractelement <4 x float> %tex, i32 0    %tex.1 = extractelement <4 x float> %tex, i32 1    %tex.2 = extractelement <4 x float> %tex, i32 2 @@ -108,8 +104,9 @@ main_body:    call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i1 0, i1 0)    %c.1.bc = bitcast i32 %c.1 to float -  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 -  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 +  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 +  %tex0 = extractelement <4 x float> %tex, i32 0 +  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0    ret <4 x float> %dtex  } @@ -361,8 +358,9 @@ main_body:  IF:    %c.bc = bitcast i32 %c to float -  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 -  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 +  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 +  %tex0 = extractelement <4 x float> %tex, i32 0 +  %dtex = call <4 x float> 
@llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0    %data.if = extractelement <4 x float> %dtex, i32 0    br label %END @@ -403,8 +401,9 @@ main_body:  IF:    %c.bc = bitcast i32 %c to float -  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 -  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 +  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 +  %tex0 = extractelement <4 x float> %tex, i32 0 +  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0    %data.if = extractelement <4 x float> %dtex, i32 0    br label %END @@ -460,7 +459,7 @@ ELSE:  END:    %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]    %coord.END.bc = bitcast i32 %coord.END to float -  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 +  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0    ret <4 x float> %tex  } @@ -477,8 +476,9 @@ END:  ;CHECK-DAG: store  define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) {  main_body: -  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 -  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x 
float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 +  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 +  %tex0 = extractelement <4 x float> %tex, i32 0 +  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0    %dtex.1 = extractelement <4 x float> %dtex, i32 0    call void @llvm.amdgcn.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0) @@ -523,8 +523,9 @@ IF:    br label %END  END: -  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 -  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 +  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 +  %tex0 = extractelement <4 x float> %tex, i32 0 +  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0    ret <4 x float> %dtex  } @@ -545,7 +546,7 @@ END:  ;CHECK: image_sample  define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) {  main_body: -  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 +  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0    
%idx.0 = extractelement <2 x i32> %idx, i32 0    %data.0 = extractelement <2 x float> %data, i32 0    call void @llvm.amdgcn.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i1 0, i1 0) @@ -555,8 +556,9 @@ main_body:    %idx.1 = extractelement <2 x i32> %idx, i32 1    %data.1 = extractelement <2 x float> %data, i32 1    call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0) -  %tex2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 -  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 +  %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 +  %tex2.0 = extractelement <4 x float> %tex2, i32 0 +  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex2.0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0    %out = fadd <4 x float> %tex, %dtex    ret <4 x float> %out @@ -576,8 +578,9 @@ main_body:  ; CHECK: v_cmpx_  define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {  main_body: -  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 -  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 +  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 +  %tex0 = extractelement <4 x 
float> %tex, i32 0 +  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0    call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) @@ -618,7 +621,7 @@ main_body:  ; CHECK: ; return  define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {  entry: -  call void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float> %in, <4 x i32> undef, <8 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0) +  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %in, i32 15, i32 undef, <8 x i32> undef, i32 0, i32 0)    br label %loop  loop: @@ -628,7 +631,8 @@ loop:    br i1 %cc, label %break, label %body  body: -  %c.next = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %c.iv, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 +  %c.iv0 = extractelement <4 x float> %c.iv, i32 0 +  %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0    %ctr.next = fadd float %ctr.iv, 2.0    br label %loop @@ -669,7 +673,7 @@ entry:    %c.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 %idx    %c = load i32, i32 addrspace(5)* %c.gep, align 4    %c.bc = bitcast i32 %c to float -  %t = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 +  %t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0    call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)    ret void @@ -687,8 +691,9 @@ entry:  ; CHECK: s_and_b64 exec, exec, [[LIVE]]  ; CHECK-NOT: exec  define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind { -  %tex = call <4 
x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 -  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 +  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0 +  %tex0 = extractelement <4 x float> %tex, i32 0 +  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0    ret <4 x float> %dtex  } @@ -700,8 +705,9 @@ define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {  ; CHECK-NOT: exec  define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {  entry: -  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 -  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 +  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0 +  %tex0 = extractelement <4 x float> %tex, i32 0 +  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0    %cc = icmp sgt i32 %c, 0    br i1 %cc, label %if, label %else @@ -733,11 +739,11 @@ main_body:    br i1 %cc, label %if, label %else  if: -  %r.if = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float 0.0, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 +  %r.if = call <4 x float> 
@llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0    br label %end  else: -  %r.else = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 0.0, float bitcast (i32 1 to float)>, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 +  %r.else = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.0, float bitcast (i32 1 to float), <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) #0    br label %end  end: @@ -757,8 +763,9 @@ end:  define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {  main_body:    %c.bc = bitcast i32 %c to float -  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 -  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 +  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 +  %tex0 = extractelement <4 x float> %tex, i32 0 +  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0    %cmp = icmp eq i32 %z, 0    br i1 %cmp, label %IF, label %ENDIF @@ -777,14 +784,13 @@ ENDIF:  }  declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1 -declare void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1  declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #2  declare void 
@llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2 -declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #3 +declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #3  declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #3 -declare <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3 -declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3 -declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3 +declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3 +declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3  declare void @llvm.AMDGPU.kill(float) #1  declare float @llvm.amdgcn.wqm.f32(float) #3  declare i32 @llvm.amdgcn.wqm.i32(i32) #3  | 

