diff options
author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2017-03-21 16:24:12 +0000 |
---|---|---|
committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2017-03-21 16:24:12 +0000 |
commit | 964a848514f034a83df4ceb9d50a9e60e3e91f23 (patch) | |
tree | 5770e2f771d1e0249a6adbab5063130cbe606118 | |
parent | dce313c3cf3948b057428ee97145d7641951afbb (diff) | |
download | bcm5719-llvm-964a848514f034a83df4ceb9d50a9e60e3e91f23.tar.gz bcm5719-llvm-964a848514f034a83df4ceb9d50a9e60e3e91f23.zip |
AMDGPU: Convert image intrinsic uses in tests
llvm-svn: 298386
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/coalescer-subrange-crash.ll | 28 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/commute-shifts.ll | 5 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/else.ll | 18 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll | 26 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/sgpr-copy.ll | 41 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/si-lod-bias.ll | 7 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/si-scheduler.ll | 8 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll | 98 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/skip-if-dead.ll | 9 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/split-smrd.ll | 8 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll | 9 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll | 8 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/unigine-liveness-crash.ll | 43 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/wqm.ll | 105 |
14 files changed, 205 insertions, 208 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-subrange-crash.ll b/llvm/test/CodeGen/AMDGPU/coalescer-subrange-crash.ll index 24d0406b4c6..ef1b3d25f88 100644 --- a/llvm/test/CodeGen/AMDGPU/coalescer-subrange-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/coalescer-subrange-crash.ll @@ -36,7 +36,8 @@ main_body: %tmp31 = insertelement <16 x i32> %tmp30, i32 undef, i32 6 %tmp32 = insertelement <16 x i32> %tmp31, i32 undef, i32 7 %tmp33 = insertelement <16 x i32> %tmp32, i32 undef, i32 8 - %tmp34 = call <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32> %tmp33, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %tmp33.bc = bitcast <16 x i32> %tmp33 to <16 x float> + %tmp34 = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v16f32.v8i32(<16 x float> %tmp33.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 true) %tmp35 = extractelement <4 x float> %tmp34, i32 0 %tmp36 = bitcast float %tmp24 to i32 %tmp37 = insertelement <16 x i32> <i32 212739, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, i32 %tmp36, i32 1 @@ -47,7 +48,8 @@ main_body: %tmp42 = insertelement <16 x i32> %tmp41, i32 undef, i32 6 %tmp43 = insertelement <16 x i32> %tmp42, i32 undef, i32 7 %tmp44 = insertelement <16 x i32> %tmp43, i32 undef, i32 8 - %tmp45 = call <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32> %tmp44, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) + %tmp44.bc = bitcast <16 x i32> %tmp44 to <16 x float> + %tmp45 = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v16f32.v8i32(<16 x float> %tmp44.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 true) %tmp46 = extractelement <4 x float> %tmp45, i32 0 %tmp47 = fmul float %tmp35, %tmp46 %tmp48 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, float %tmp47, 14 @@ -55,20 +57,10 @@ main_body: ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %tmp49 } -; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #0 +declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1 +declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v16f32.v8i32(<16 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 - -; Function Attrs: nounwind readnone -declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0 - -; Function Attrs: nounwind readnone -declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0 - -; Function Attrs: nounwind readnone -declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #0 - -attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll index c90af6a3573..84d8bf2bd70 100644 --- a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll +++ b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll @@ -7,7 +7,7 @@ define amdgpu_ps float @main(float %arg0, float %arg1) #0 { bb: %tmp = fptosi float %arg0 to i32 - %tmp1 = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp1 = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> undef, <8 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false) %tmp2.f = extractelement <4 x float> %tmp1, i32 0 %tmp2 = bitcast float %tmp2.f to i32 %tmp3 = and i32 %tmp, 7 @@ -20,8 +20,9 @@ bb: ret float %tmp9 } -declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 +declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/else.ll b/llvm/test/CodeGen/AMDGPU/else.ll index ef1e64763d4..22338e4f50e 100644 --- a/llvm/test/CodeGen/AMDGPU/else.ll +++ b/llvm/test/CodeGen/AMDGPU/else.ll @@ -1,12 +1,12 @@ -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: {{^}}else_no_execfix: ; CHECK: ; %Flow ; CHECK-NEXT: s_or_saveexec_b64 [[DST:s\[[0-9]+:[0-9]+\]]], ; CHECK-NEXT: s_xor_b64 exec, exec, [[DST]] ; CHECK-NEXT: ; mask branch -define amdgpu_ps float @else_no_execfix(i32 %z, float %v) { +define amdgpu_ps float @else_no_execfix(i32 %z, float %v) #0 { main_body: %cc = icmp sgt i32 %z, 5 br i1 %cc, label %if, label %else @@ -33,7 +33,7 @@ end: ; CHECK-NEXT: s_and_b64 [[AND_INIT:s\[[0-9]+:[0-9]+\]]], exec, [[DST]] ; CHECK-NEXT: s_xor_b64 exec, exec, [[AND_INIT]] ; CHECK-NEXT: ; mask branch -define amdgpu_ps void @else_execfix_leave_wqm(i32 %z, float %v) { +define amdgpu_ps void @else_execfix_leave_wqm(i32 %z, float %v) #0 { main_body: %cc = icmp sgt i32 %z, 5 br i1 %cc, label %if, label %else @@ -44,8 +44,7 @@ if: else: %c = fmul float %v, 3.0 - %c.i = bitcast float %c to i32 - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %v.else = extractelement <4 x float> %tex, i32 0 br label %end @@ -55,6 +54,9 @@ end: ret void } -declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) nounwind +declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 -declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) nounwind readnone +attributes #0 = { nounwind } +attributes #1 = { nounwind writeonly } +attributes #2 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll index fd1a463fd3e..f0af876567b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s ; CHECK-LABEL: {{^}}test1: ; CHECK: v_cndmask_b32_e64 v0, 0, 1, exec @@ -7,7 +7,7 @@ ; there is no WQM use and therefore llvm.amdgcn.ps.live is constant. However, ; the expectation is that the intrinsic will be used in non-trivial shaders, ; so such an optimization doesn't seem worth the effort. -define amdgpu_ps float @test1() { +define amdgpu_ps float @test1() #0 { %live = call i1 @llvm.amdgcn.ps.live() %live.32 = zext i1 %live to i32 %r = bitcast i32 %live.32 to float @@ -19,12 +19,11 @@ define amdgpu_ps float @test1() { ; CHECK-DAG: s_wqm_b64 exec, exec ; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[LIVE]] ; CHECK: image_sample v0, [[VAR]], -define amdgpu_ps float @test2() { +define amdgpu_ps float @test2() #0 { %live = call i1 @llvm.amdgcn.ps.live() %live.32 = zext i1 %live to i32 - - %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %live.32, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - + %live.32.bc = bitcast i32 %live.32 to float + %t = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %live.32.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %r = extractelement <4 x float> %t, i32 0 ret float %r } @@ -35,7 +34,7 @@ define amdgpu_ps float @test2() { ; CHECK-DAG: s_xor_b64 [[HELPER:s\[[0-9]+:[0-9]+\]]], [[LIVE]], -1 ; CHECK_DAG: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[HELPER]] ; CHECK: ; %dead -define amdgpu_ps float @test3(i32 %in) { +define amdgpu_ps float @test3(i32 %in) #0 { entry: %live = call i1 @llvm.amdgcn.ps.live() br i1 %live, label %end, label %dead @@ -46,14 +45,15 @@ dead: end: %tc = phi i32 [ %in, %entry ], [ %tc.dead, %dead ] - %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %tc, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - + %tc.bc = bitcast i32 %tc to float + %t = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %tc.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %r = extractelement <4 x float> %t, i32 0 ret float %r } -declare i1 @llvm.amdgcn.ps.live() #0 - -declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare i1 @llvm.amdgcn.ps.live() #1 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 -attributes #0 = { nounwind readnone } +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll index 59267f8b5c6..79bb5aff8f4 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll @@ -86,8 +86,9 @@ main_body: %tmp47 = insertelement <2 x i32> undef, i32 %tmp45, i32 0 %tmp48 = insertelement <2 x i32> %tmp47, i32 %tmp46, i32 1 %tmp39.bc = bitcast <16 x i8> %tmp39 to <4 x i32> - %tmp49 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp48, <8 x i32> %tmp37, <4 x i32> %tmp39.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tmp50 = extractelement <4 x float> %tmp49, i32 2 + %a.bc.i = bitcast <2 x i32> %tmp48 to <2 x float> + %tmp1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i, <8 x i32> %tmp37, <4 x i32> %tmp39.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp50 = extractelement <4 x float> %tmp1, i32 2 %tmp51 = call float @llvm.fabs.f32(float %tmp50) %tmp52 = fmul float %p2.i18, %p2.i18 %tmp53 = fmul float %p2.i12, %p2.i12 @@ -239,17 +240,17 @@ entry: br i1 %tmp27, label %if, label %else if: ; preds = %entry - %val.if = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 11, i32 13>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %val.if.0 = extractelement <4 x float> %val.if, i32 0 - %val.if.1 = extractelement <4 x float> %val.if, i32 1 - %val.if.2 = extractelement <4 x float> %val.if, i32 2 + %tmp1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 0x36D6000000000000, float 0x36DA000000000000>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %val.if.0 = extractelement <4 x float> %tmp1, i32 0 + %val.if.1 = extractelement <4 x float> %tmp1, i32 1 + %val.if.2 = extractelement <4 x float> %tmp1, i32 2 br label %endif else: ; preds = %entry - %val.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 5, i32 7>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %val.else.0 = extractelement <4 x float> %val.else, i32 0 - %val.else.1 = extractelement <4 x float> %val.else, i32 1 - %val.else.2 = extractelement <4 x float> %val.else, i32 2 + %tmp2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 0x36C4000000000000, float 0x36CC000000000000>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %val.else.0 = extractelement <4 x float> %tmp2, i32 0 + %val.else.1 = extractelement <4 x float> %tmp2, i32 1 + %val.else.2 = extractelement <4 x float> %tmp2, i32 2 br label %endif endif: ; preds = %else, %if @@ -356,7 +357,8 @@ bb38: ; preds = %bb %tmp54 = insertelement <2 x i32> undef, i32 %tmp52, i32 0 %tmp55 = insertelement <2 x i32> %tmp54, i32 %tmp53, i32 1 %tmp56 = bitcast <8 x i32> %tmp26 to <8 x i32> - %tmp58 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp55, <8 x i32> %tmp56, <4 x i32> %tmp28, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %a.bc.i = bitcast <2 x i32> %tmp55 to <2 x float> + %tmp2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i, <8 x i32> %tmp56, <4 x i32> %tmp28, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) br label %bb71 bb80: ; preds = %bb @@ -366,11 +368,12 @@ bb80: ; preds = %bb %tmp83 = insertelement <2 x i32> undef, i32 %tmp81, i32 0 %tmp84 = insertelement <2 x i32> %tmp83, i32 %tmp82.2, i32 1 %tmp85 = bitcast <8 x i32> %tmp26 to <8 x i32> - %tmp87 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp84, <8 x i32> %tmp85, <4 x i32> %tmp28, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %a.bc.i1 = bitcast <2 x i32> %tmp84 to <2 x float> + %tmp3 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i1, <8 x i32> %tmp85, <4 x i32> %tmp28, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) br label %bb71 bb71: ; preds = %bb80, %bb38 - %tmp72 = phi <4 x float> [ %tmp58, %bb38 ], [ %tmp87, %bb80 ] + %tmp72 = phi <4 x float> [ %tmp2, %bb38 ], [ %tmp3, %bb80 ] %tmp88 = extractelement <4 x float> %tmp72, i32 0 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp88, float %tmp88, float %tmp88, float %tmp88, i1 true, i1 true) #0 ret void @@ -384,8 +387,8 @@ bb: %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %arg, i32 0, i32 %tid %tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0 - %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tmp10 = extractelement <4 x float> %tmp9, i32 0 + %tmp = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 7.500000e-01, float 2.500000e-01>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp10 = extractelement <4 x float> %tmp, i32 0 %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %tmp10) call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp12, <2 x half> undef, i1 true, i1 true) #0 ret void @@ -399,8 +402,8 @@ bb: %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp7 = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i32 0, i32 %tid %tmp8 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp7, align 16, !tbaa !0 - %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> undef, <4 x i32> %tmp8, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tmp10 = extractelement <4 x float> %tmp9, i32 0 + %tmp = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 7.500000e-01, float 2.500000e-01>, <8 x i32> undef, <4 x i32> %tmp8, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp10 = extractelement <4 x float> %tmp, i32 0 %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp10, float undef) call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp12, <2 x half> undef, i1 true, i1 true) #0 ret void @@ -416,12 +419,12 @@ declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 - -declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 declare float @llvm.SI.load.const(<16 x i8>, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } !0 = !{!1, !1, i64 0, i32 1} !1 = !{!"const", !2} diff --git a/llvm/test/CodeGen/AMDGPU/si-lod-bias.ll b/llvm/test/CodeGen/AMDGPU/si-lod-bias.ll index d64a0f3a166..3a7359ea4ff 100644 --- a/llvm/test/CodeGen/AMDGPU/si-lod-bias.ll +++ b/llvm/test/CodeGen/AMDGPU/si-lod-bias.ll @@ -35,7 +35,8 @@ main_body: %tmp33 = insertelement <4 x i32> %tmp32, i32 %tmp30, i32 2 %tmp34 = insertelement <4 x i32> %tmp33, i32 undef, i32 3 %tmp25.bc = bitcast <16 x i8> %tmp25 to <4 x i32> - %tmp35 = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> %tmp34, <8 x i32> %tmp23, <4 x i32> %tmp25.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp34.bc = bitcast <4 x i32> %tmp34 to <4 x float> + %tmp35 = call <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float> %tmp34.bc, <8 x i32> %tmp23, <4 x i32> %tmp25.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp36 = extractelement <4 x float> %tmp35, i32 0 %tmp37 = extractelement <4 x float> %tmp35, i32 1 %tmp38 = extractelement <4 x float> %tmp35, i32 2 @@ -47,12 +48,12 @@ main_body: declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 - +declare <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 declare float @llvm.SI.load.const(<16 x i8>, i32) #1 -declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } !0 = !{!1, !1, i64 0, i32 1} !1 = !{!"const", !2} diff --git a/llvm/test/CodeGen/AMDGPU/si-scheduler.ll b/llvm/test/CodeGen/AMDGPU/si-scheduler.ll index 08f7972cbb1..462528c4ff1 100644 --- a/llvm/test/CodeGen/AMDGPU/si-scheduler.ll +++ b/llvm/test/CodeGen/AMDGPU/si-scheduler.ll @@ -40,7 +40,9 @@ main_body: %tmp30 = insertelement <2 x i32> %tmp29, i32 %tmp28, i32 1 %tmp22.bc = bitcast <32 x i8> %tmp22 to <8 x i32> %tmp24.bc = bitcast <16 x i8> %tmp24 to <4 x i32> - %tmp31 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp30, <8 x i32> %tmp22.bc, <4 x i32> %tmp24.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp30.bc = bitcast <2 x i32> %tmp30 to <2 x float> + %tmp31 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp30.bc, <8 x i32> %tmp22.bc, <4 x i32> %tmp24.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp32 = extractelement <4 x float> %tmp31, i32 0 %tmp33 = extractelement <4 x float> %tmp31, i32 1 %tmp34 = extractelement <4 x float> %tmp31, i32 2 @@ -54,12 +56,12 @@ main_body: declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 - -declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } !0 = !{!1, !1, i64 0, i32 1} !1 = !{!"const", !2} diff --git a/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll index bcaaba72403..8731e74d63a 100644 --- a/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll @@ -364,7 +364,8 @@ ENDIF: ; preds = %LOOP %tmp240 = insertelement <8 x i32> %tmp239, i32 %tmp238, i32 5 %tmp241 = insertelement <8 x i32> %tmp240, i32 undef, i32 6 %tmp242 = insertelement <8 x i32> %tmp241, i32 undef, i32 7 - %tmp243 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp242, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp242.bc = bitcast <8 x i32> %tmp242 to <8 x float> + %tmp243 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp242.bc, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp244 = extractelement <4 x float> %tmp243, i32 3 %tmp245 = fcmp oge float %temp30.0, %tmp244 %tmp246 = sext i1 %tmp245 to i32 @@ -410,7 +411,8 @@ IF67: ; preds = %LOOP65 %tmp275 = insertelement <8 x i32> %tmp274, i32 undef, i32 6 %tmp276 = insertelement <8 x i32> %tmp275, i32 undef, i32 7 %tmp67.bc = bitcast <16 x i8> %tmp67 to <4 x i32> - %tmp277 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp276, <8 x i32> %tmp65, <4 x i32> %tmp67.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp276.bc = bitcast <8 x i32> %tmp276 to <8 x float> + %tmp277 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp276.bc, <8 x i32> %tmp65, <4 x i32> %tmp67.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp278 = extractelement <4 x float> %tmp277, i32 0 %tmp279 = extractelement <4 x float> %tmp277, i32 1 %tmp280 = extractelement <4 x float> %tmp277, i32 2 @@ -431,7 +433,8 @@ IF67: ; preds = %LOOP65 %tmp295 = insertelement <8 x i32> %tmp294, i32 undef, i32 6 %tmp296 = insertelement <8 x i32> %tmp295, i32 undef, i32 7 %tmp83.bc = bitcast <16 x i8> %tmp83 to <4 x i32> - %tmp297 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp296, <8 x i32> %tmp81, <4 x i32> %tmp83.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp296.bc = bitcast <8 x i32> %tmp296 to <8 x float> + %tmp297 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp296.bc, <8 x i32> %tmp81, <4 x i32> %tmp83.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp298 = extractelement <4 x float> %tmp297, i32 0 %tmp299 = extractelement <4 x float> %tmp297, i32 1 %tmp300 = extractelement <4 x float> %tmp297, i32 2 @@ -450,7 +453,8 @@ IF67: ; preds = %LOOP65 %tmp313 = insertelement <8 x i32> %tmp312, i32 undef, i32 6 %tmp314 = insertelement <8 x i32> %tmp313, i32 undef, i32 7 %tmp79.bc = bitcast <16 x i8> %tmp79 to <4 x i32> - %tmp315 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp314, <8 x i32> %tmp77, <4 x i32> %tmp79.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp314.bc = bitcast <8 x i32> %tmp314 to <8 x float> + %tmp315 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp314.bc, <8 x i32> %tmp77, <4 x i32> %tmp79.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp316 = extractelement <4 x float> %tmp315, i32 0 %tmp317 = extractelement <4 x float> %tmp315, i32 1 %tmp318 = extractelement <4 x float> %tmp315, i32 2 @@ -480,7 +484,8 @@ IF67: ; preds = %LOOP65 %tmp342 = insertelement <8 x i32> %tmp341, i32 %tmp336, i32 5 %tmp343 = insertelement <8 x i32> %tmp342, i32 undef, i32 6 %tmp344 = insertelement <8 x i32> %tmp343, i32 undef, i32 7 - %tmp345 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp344, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp344.bc = bitcast <8 x i32> %tmp344 to <8 x float> + %tmp345 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp344.bc, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp346 = extractelement <4 x float> %tmp345, i32 0 %tmp347 = extractelement <4 x float> %tmp345, i32 1 %tmp348 = extractelement <4 x float> %tmp345, i32 2 @@ -511,7 +516,8 @@ IF67: ; preds = %LOOP65 %tmp373 = insertelement <8 x i32> %tmp372, i32 undef, i32 6 %tmp374 = insertelement <8 x i32> %tmp373, i32 undef, i32 7 %tmp71.bc = bitcast <16 x i8> %tmp71 to <4 x i32> - %tmp375 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp374, <8 x i32> %tmp69, <4 x i32> %tmp71.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp374.bc = bitcast <8 x i32> %tmp374 to <8 x float> + %tmp375 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp374.bc, <8 x i32> %tmp69, <4 x i32> %tmp71.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp376 = extractelement <4 x float> %tmp375, i32 0 %tmp377 = extractelement <4 x float> %tmp375, i32 1 %tmp378 = extractelement <4 x float> %tmp375, i32 2 @@ -566,7 +572,8 @@ IF67: ; preds = %LOOP65 %tmp427 = insertelement <8 x i32> %tmp426, i32 undef, i32 6 %tmp428 = insertelement <8 x i32> %tmp427, i32 undef, i32 7 %tmp87.bc = bitcast <16 x i8> %tmp87 to <4 x i32> - %tmp429 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp428, <8 x i32> %tmp85, <4 x i32> %tmp87.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp428.bc = bitcast <8 x i32> %tmp428 to <8 x float> + %tmp429 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp428.bc, <8 x i32> %tmp85, <4 x i32> %tmp87.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp430 = extractelement <4 x float> %tmp429, i32 0 %tmp431 = extractelement <4 x float> %tmp429, i32 1 %tmp432 = extractelement <4 x float> %tmp429, i32 2 @@ -618,7 +625,8 @@ IF67: ; preds = %LOOP65 %tmp468 = insertelement <4 x i32> %tmp467, i32 %tmp465, i32 2 %tmp469 = insertelement <4 x i32> %tmp468, i32 undef, i32 3 %tmp91.bc = bitcast <16 x i8> %tmp91 to <4 x i32> - %tmp470 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tmp469, <8 x i32> %tmp89, <4 x i32> %tmp91.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp469.bc = bitcast <4 x i32> %tmp469 to <4 x float> + %tmp470 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tmp469.bc, <8 x i32> %tmp89, <4 x i32> %tmp91.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %tmp471 = extractelement <4 x float> %tmp470, i32 0 %tmp472 = extractelement <4 x float> %tmp470, i32 1 %tmp473 = extractelement <4 x float> %tmp470, i32 2 @@ -720,7 +728,8 @@ IF67: ; preds = %LOOP65 %tmp569 = insertelement <8 x i32> %tmp568, i32 undef, i32 6 %tmp570 = insertelement <8 x i32> %tmp569, i32 undef, i32 7 %tmp75.bc = bitcast <16 x i8> %tmp75 to <4 x i32> - %tmp571 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp570, <8 x i32> %tmp73, <4 x i32> %tmp75.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp570.bc = bitcast <8 x i32> %tmp570 to <8 x float> + %tmp571 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp570.bc, <8 x i32> %tmp73, <4 x i32> %tmp75.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp572 = extractelement <4 x float> %tmp571, i32 0 %tmp573 = extractelement <4 x float> %tmp571, i32 1 %tmp574 = extractelement <4 x float> %tmp571, i32 2 @@ -742,7 +751,8 @@ ENDIF66: ; preds = %LOOP65 %tmp588 = insertelement <8 x i32> %tmp587, i32 %tmp586, i32 5 %tmp589 = insertelement <8 x i32> %tmp588, i32 undef, i32 6 %tmp590 = insertelement <8 x i32> %tmp589, i32 undef, i32 7 - %tmp591 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp590, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp590.bc = bitcast <8 x i32> %tmp590 to <8 x float> + %tmp591 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp590.bc, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp592 = extractelement <4 x float> %tmp591, i32 3 %tmp593 = fcmp oge float %temp30.1, %tmp592 %tmp594 = sext i1 %tmp593 to i32 @@ -768,7 +778,7 @@ ENDIF66: ; preds = %LOOP65 ; GCN-LABEL: {{^}}main1: ; GCN: s_endpgm ; TOVGPR: ScratchSize: 0{{$}} -define amdgpu_ps void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) { +define amdgpu_ps void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { main_body: %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0 %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 @@ -1135,7 +1145,8 @@ main_body: %tmp223 = insertelement <2 x i32> undef, i32 %tmp221, i32 0 %tmp224 = insertelement <2 x i32> %tmp223, i32 %tmp222, i32 1 %tmp132.bc = bitcast <16 x i8> %tmp132 to <4 x i32> - %tmp225 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp224, <8 x i32> %tmp130, <4 x i32> %tmp132.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp224.bc = bitcast <2 x i32> %tmp224 to <2 x float> + %tmp225 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp224.bc, <8 x i32> %tmp130, <4 x i32> %tmp132.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp226 = extractelement <4 x float> %tmp225, i32 0 %tmp227 = extractelement <4 x float> %tmp225, i32 1 %tmp228 = extractelement <4 x float> %tmp225, i32 2 @@ -1208,7 +1219,8 @@ LOOP: ; preds = %LOOP, %main_body %tmp280 = insertelement <4 x i32> %tmp279, i32 0, i32 2 %tmp281 = insertelement <4 x i32> %tmp280, i32 undef, i32 3 %tmp148.bc = bitcast <16 x i8> %tmp148 to <4 x i32> - %tmp282 = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> %tmp281, <8 x i32> %tmp146, <4 x i32> %tmp148.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp281.bc = bitcast <4 x i32> %tmp281 to <4 x float> + %tmp282 = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> %tmp281.bc, <8 x i32> %tmp146, <4 x i32> %tmp148.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp283 = extractelement <4 x float> %tmp282, i32 3 %tmp284 = fadd float %temp168.0, %tmp273 %tmp285 = fadd float %temp169.0, %tmp274 @@ -1272,11 +1284,12 @@ IF189: ; preds = %LOOP %tmp340 = insertelement <2 x i32> undef, i32 %tmp338, i32 0 %tmp341 = insertelement <2 x i32> %tmp340, i32 %tmp339, i32 1 %tmp136.bc = bitcast <16 x i8> %tmp136 to <4 x i32> - %tmp342 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp341, <8 x i32> %tmp134, <4 x i32> %tmp136.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tmp343 = extractelement <4 x float> %tmp342, i32 0 - %tmp344 = extractelement <4 x float> %tmp342, i32 1 - %tmp345 = extractelement <4 x float> %tmp342, i32 2 - %tmp346 = extractelement <4 x float> %tmp342, i32 3 + %a.bc.i = bitcast <2 x i32> %tmp341 to <2 x float> + %tmp0 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i, <8 x i32> %tmp134, <4 x i32> %tmp136.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp343 = extractelement <4 x float> %tmp0, i32 0 + %tmp344 = extractelement <4 x float> %tmp0, i32 1 + %tmp345 = extractelement <4 x float> %tmp0, i32 2 + %tmp346 = extractelement <4 x float> %tmp0, i32 3 %tmp347 = fmul float %tmp343, %tmp22 %tmp348 = fmul float %tmp344, %tmp23 %tmp349 = fmul float %tmp345, %tmp24 @@ -1305,8 +1318,9 @@ IF189: ; preds = %LOOP %tmp360 = insertelement <2 x i32> undef, i32 %tmp358, i32 0 %tmp361 = insertelement <2 x i32> %tmp360, i32 %tmp359, i32 1 %tmp152.bc = bitcast <16 x i8> %tmp152 to <4 x i32> - %tmp362 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp361, <8 x i32> %tmp150, <4 x i32> %tmp152.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tmp363 = extractelement <4 x float> %tmp362, i32 2 + %a.bc.i3 = bitcast <2 x i32> %tmp361 to <2 x float> + %tmp1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i3, <8 x i32> %tmp150, <4 x i32> %tmp152.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp363 = extractelement <4 x float> %tmp1, i32 2 %tmp364 = fmul float %result.i40, %result.i %tmp365 = fmul float %result.i36, %result.i44 %tmp366 = fmul float %result.i32, %result.i42 @@ -1316,11 +1330,12 @@ IF189: ; preds = %LOOP %tmp370 = insertelement <2 x i32> undef, i32 %tmp368, i32 0 %tmp371 = insertelement <2 x i32> %tmp370, i32 %tmp369, i32 1 %tmp140.bc = bitcast <16 x i8> %tmp140 to <4 x i32> - %tmp372 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp371, <8 x i32> %tmp138, <4 x i32> %tmp140.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tmp373 = extractelement <4 x float> %tmp372, i32 0 - %tmp374 = extractelement <4 x float> %tmp372, i32 1 - %tmp375 = extractelement <4 x float> %tmp372, i32 2 - %tmp376 = extractelement <4 x float> %tmp372, i32 3 + %a.bc.i2 = bitcast <2 x i32> %tmp371 to <2 x float> + %tmp2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i2, <8 x i32> %tmp138, <4 x i32> %tmp140.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp373 = extractelement <4 x float> %tmp2, i32 0 + %tmp374 = extractelement <4 x float> %tmp2, i32 1 + %tmp375 = extractelement <4 x float> %tmp2, i32 2 + %tmp376 = extractelement <4 x float> %tmp2, i32 3 %tmp377 = fcmp olt float 0.000000e+00, %tmp375 %tmp378 = sext i1 %tmp377 to i32 %tmp379 = bitcast i32 %tmp378 to float @@ -1333,11 +1348,12 @@ IF189: ; preds = %LOOP %tmp384 = insertelement <2 x i32> undef, i32 %tmp382, i32 0 %tmp385 = insertelement <2 x i32> %tmp384, i32 %tmp383, i32 1 %tmp144.bc = bitcast <16 x i8> %tmp144 to <4 x i32> - %tmp386 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp385, <8 x i32> %tmp142, <4 x i32> %tmp144.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tmp387 = extractelement <4 x float> %tmp386, i32 0 - %tmp388 = extractelement <4 x float> %tmp386, i32 1 - %tmp389 = extractelement <4 x float> %tmp386, i32 2 - %tmp390 = extractelement <4 x float> %tmp386, i32 3 + %a.bc.i1 = bitcast <2 x i32> %tmp385 to <2 x float> + %tmp3 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i1, <8 x i32> %tmp142, <4 x i32> %tmp144.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp387 = extractelement <4 x float> %tmp3, i32 0 + %tmp388 = extractelement <4 x float> %tmp3, i32 1 + %tmp389 = extractelement <4 x float> %tmp3, i32 2 + %tmp390 = extractelement <4 x float> %tmp3, i32 3 %tmp391 = fcmp olt float 0.000000e+00, %tmp389 %tmp392 = sext i1 %tmp391 to i32 %tmp393 = bitcast i32 %tmp392 to float @@ -1431,7 +1447,8 @@ ENDIF197: ; preds = %IF198, %IF189 %tmp468 = insertelement <2 x i32> undef, i32 %tmp466, i32 0 %tmp469 = insertelement <2 x i32> %tmp468, i32 %tmp467, i32 1 %tmp160.bc = bitcast <16 x i8> %tmp160 to <4 x i32> - %tmp470 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp469, <8 x i32> %tmp158, <4 x i32> %tmp160.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp469.bc = bitcast <2 x i32> %tmp469 to <2 x float> + %tmp470 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp469.bc, <8 x i32> %tmp158, <4 x i32> %tmp160.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp471 = extractelement <4 x float> %tmp470, i32 0 %tmp472 = extractelement <4 x float> %tmp470, i32 1 %tmp473 = extractelement <4 x float> %tmp470, i32 2 @@ -1449,7 +1466,8 @@ ENDIF197: ; preds = %IF198, %IF189 %tmp485 = insertelement <2 x i32> undef, i32 %tmp483, i32 0 %tmp486 = insertelement <2 x i32> %tmp485, i32 %tmp484, i32 1 %tmp156.bc = bitcast <16 x i8> %tmp156 to <4 x i32> - %tmp487 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp486, <8 x i32> %tmp154, <4 x i32> %tmp156.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp486.bc = bitcast <2 x i32> %tmp486 to <2 x float> + %tmp487 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp486.bc, <8 x i32> %tmp154, <4 x i32> %tmp156.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp488 = extractelement <4 x float> %tmp487, i32 0 %tmp489 = extractelement <4 x float> %tmp487, i32 1 %tmp490 = extractelement <4 x float> %tmp487, i32 2 @@ -1657,7 +1675,8 @@ ENDIF209: ; preds = %ELSE214, %ELSE211, %tmp658 = insertelement <4 x i32> %tmp657, i32 %tmp655, i32 2 %tmp659 = insertelement <4 x i32> %tmp658, i32 undef, i32 3 %tmp128.bc = bitcast <16 x i8> %tmp128 to <4 x i32> - %tmp660 = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> %tmp659, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp659.bc = bitcast <4 x i32> %tmp659 to <4 x float> + %tmp660 = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> %tmp659.bc, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp661 = extractelement <4 x float> %tmp660, i32 0 %tmp662 = extractelement <4 x float> %tmp660, i32 1 %tmp663 = bitcast float %tmp646 to i32 @@ -1667,7 +1686,8 @@ ENDIF209: ; preds = %ELSE214, %ELSE211, %tmp667 = insertelement <4 x i32> %tmp666, i32 %tmp664, i32 1 %tmp668 = insertelement <4 x i32> %tmp667, i32 %tmp665, i32 2 %tmp669 = insertelement <4 x i32> %tmp668, i32 undef, i32 3 - %tmp670 = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> %tmp669, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp669.bc = bitcast <4 x i32> %tmp669 to <4 x float> + %tmp670 = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> %tmp669.bc, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp671 = extractelement <4 x float> %tmp670, i32 0 %tmp672 = extractelement <4 x float> %tmp670, i32 1 %tmp673 = fsub float -0.000000e+00, %tmp662 @@ -1845,15 +1865,15 @@ declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1 declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 - -declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 declare float @llvm.SI.load.const(<16 x i8>, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } !0 = !{!1, !1, i64 0, i32 1} !1 = !{!"const", !2} diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll index 60cee7a3499..3f53572ab44 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -357,7 +357,7 @@ bb7: ; preds = %bb4 ; CHECK: [[END]]: ; CHECK: s_or_b64 exec, exec ; CHECK: s_endpgm -define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, <4 x i32> %arg2) #0 { +define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, <4 x float> %arg2) #0 { bb: %tmp = fcmp ult float %arg1, 0.000000e+00 br i1 %tmp, label %bb3, label %bb4 @@ -367,7 +367,7 @@ bb3: ; preds = %bb br label %bb4 bb4: ; preds = %bb3, %bb - %tmp5 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %arg2, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp5 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %arg2, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp6 = extractelement <4 x float> %tmp5, i32 0 %tmp7 = fcmp une float %tmp6, 0.000000e+00 br i1 %tmp7, label %bb8, label %bb9 @@ -380,9 +380,8 @@ bb9: ; preds = %bb4 ret void } +declare <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1 declare void @llvm.AMDGPU.kill(float) #0 -declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) nounwind attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } +attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/split-smrd.ll b/llvm/test/CodeGen/AMDGPU/split-smrd.ll index 09d1f5703c0..cdb1b1e3b50 100644 --- a/llvm/test/CodeGen/AMDGPU/split-smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/split-smrd.ll @@ -21,21 +21,21 @@ bb3: ; preds = %bb %tmp6 = sext i32 %tmp5 to i64 %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %arg, i64 0, i64 %tmp6 %tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0 - %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp9 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float bitcast (i32 1061158912 to float), float bitcast (i32 1048576000 to float)>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp10 = extractelement <4 x float> %tmp9, i32 0 %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp10, float undef) call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp12, <2 x half> undef, i1 true, i1 true) #0 ret void } -declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 - +declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 declare float @llvm.SI.load.const(<16 x i8>, i32) #1 -declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } !0 = !{!1, !1, i64 0, i32 1} !1 = !{!"const", !2} diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll index 70a150a6bad..0ec45c68bb7 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL:{{^}}row_filter_C1_D0: -define void @row_filter_C1_D0() { +define void @row_filter_C1_D0() #0 { entry: br i1 undef, label %for.inc.1, label %do.body.preheader @@ -65,7 +65,7 @@ bb7: ; preds = %bb6 br label %bb4 bb9: ; preds = %bb2 - %tmp10 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp10 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp11 = extractelement <4 x float> %tmp10, i32 1 %tmp12 = extractelement <4 x float> %tmp10, i32 3 br label %bb14 @@ -95,8 +95,9 @@ bb27: ; preds = %bb24 br label %bb14 } + declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 -declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1 attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } +attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll index 4902e9a3caf..f92a847b3e5 100644 --- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll @@ -35,7 +35,8 @@ bb: %tmp1 = load volatile i32, i32 addrspace(1)* undef, align 4 %tmp2 = insertelement <4 x i32> undef, i32 %tmp1, i32 0 %tmp3 = insertelement <4 x i32> %tmp2, i32 %tmp1, i32 1 - %tmp4 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tmp3, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp3.cast = bitcast <4 x i32> %tmp3 to <4 x float> + %tmp4 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tmp3.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp5 = extractelement <4 x float> %tmp4, i32 0 %tmp6 = fmul float %tmp5, undef %tmp7 = fadd float %tmp6, %tmp6 @@ -83,8 +84,7 @@ define void @partially_undef_copy() #0 { ret void } -declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare float @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1 attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } +attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/unigine-liveness-crash.ll b/llvm/test/CodeGen/AMDGPU/unigine-liveness-crash.ll index 44d88ddc45d..853131baed5 100644 --- a/llvm/test/CodeGen/AMDGPU/unigine-liveness-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/unigine-liveness-crash.ll @@ -17,7 +17,8 @@ main_body: %j.f.i = bitcast i32 %j.i to float %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 3, i32 4, i32 %arg6) #2 %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 3, i32 4, i32 %arg6) #2 - %tmp23 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp23 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp24 = extractelement <4 x float> %tmp23, i32 3 %tmp25 = fmul float %tmp24, undef %tmp26 = fmul float undef, %p2.i @@ -26,14 +27,15 @@ main_body: %tmp29 = insertelement <4 x i32> undef, i32 %tmp28, i32 0 %tmp30 = insertelement <4 x i32> %tmp29, i32 0, i32 1 %tmp31 = insertelement <4 x i32> %tmp30, i32 undef, i32 2 - %tmp32 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp31, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp31.cast = bitcast <4 x i32> %tmp31 to <4 x float> + %tmp32 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %tmp31.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp33 = extractelement <4 x float> %tmp32, i32 0 %tmp34 = fadd float undef, %tmp33 %tmp35 = fadd float %tmp34, undef %tmp36 = fadd float %tmp35, undef %tmp37 = fadd float %tmp36, undef %tmp38 = fadd float %tmp37, undef - %tmp39 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp39 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp40 = extractelement <4 x float> %tmp39, i32 0 %tmp41 = extractelement <4 x float> %tmp39, i32 1 %tmp42 = extractelement <4 x float> %tmp39, i32 2 @@ -50,7 +52,8 @@ main_body: %tmp53 = insertelement <4 x i32> undef, i32 %tmp50, i32 0 %tmp54 = insertelement <4 x i32> %tmp53, i32 %tmp51, i32 1 %tmp55 = insertelement <4 x i32> %tmp54, i32 %tmp52, i32 2 - %tmp56 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp55, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp55.cast = bitcast <4 x i32> %tmp55 to <4 x float> + %tmp56 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %tmp55.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp57 = extractelement <4 x float> %tmp56, i32 0 %tmp58 = fadd float %tmp38, %tmp57 %tmp59 = fadd float undef, %tmp46 @@ -59,7 +62,8 @@ main_body: %tmp62 = bitcast float %tmp60 to i32 %tmp63 = insertelement <4 x i32> undef, i32 %tmp61, i32 1 %tmp64 = insertelement <4 x i32> %tmp63, i32 %tmp62, i32 2 - %tmp65 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp64, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp64.cast = bitcast <4 x i32> %tmp64 to <4 x float> + %tmp65 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %tmp64.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp66 = extractelement <4 x float> %tmp65, i32 0 %tmp67 = fadd float %tmp58, %tmp66 %tmp68 = fmul float %tmp67, 1.250000e-01 @@ -99,33 +103,22 @@ IF29: ; preds = %LOOP ENDIF28: ; preds = %LOOP %tmp85 = insertelement <4 x i32> %tmp72, i32 undef, i32 1 %tmp86 = insertelement <4 x i32> %tmp85, i32 undef, i32 2 - %tmp87 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp86, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tmp86.cast = bitcast <4 x i32> %tmp86 to <4 x float> + %tmp87 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %tmp86.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp88 = extractelement <4 x float> %tmp87, i32 0 %tmp89 = fadd float undef, %tmp88 br label %LOOP } -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -; Function Attrs: nounwind readnone -declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 - -; Function Attrs: nounwind readnone -declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1 - -; Function Attrs: nounwind readnone -declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 - -; Function Attrs: nounwind readnone declare float @llvm.minnum.f32(float, float) #1 - -; Function Attrs: nounwind readnone declare float @llvm.maxnum.f32(float, float) #1 +declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1 +declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 attributes #0 = { nounwind "InitialPSInputAddr"="36983" "target-cpu"="tonga" } attributes #1 = { nounwind readnone } -attributes #2 = { nounwind } +attributes #2 = { nounwind readonly } +attributes #3 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index e06116ce734..9f277b2c9a5 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -18,9 +18,9 @@ main_body: ;CHECK-NEXT: ; %main_body ;CHECK-NEXT: s_wqm_b64 exec, exec ;CHECK-NOT: exec -define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) { +define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x float> %c) { main_body: - %c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %c.1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %c.2 = bitcast <4 x float> %c.1 to <4 x i32> %c.3 = extractelement <4 x i32> %c.2, i32 0 %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3 @@ -40,9 +40,9 @@ main_body: ;CHECK: store ;CHECK-NOT: exec ;CHECK: .size test3 -define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <4 x i32> %c) { +define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <4 x float> %c) { main_body: - %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %tex.1 = bitcast <4 x float> %tex to <4 x i32> %tex.2 = extractelement <4 x i32> %tex.1, i32 0 @@ -68,10 +68,9 @@ main_body: %c.1 = mul i32 %c, %d call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i1 0, i1 0) - - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex.1 = bitcast <4 x float> %tex to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %c.1.bc = bitcast i32 %c.1 to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 ret <4 x float> %dtex } @@ -99,9 +98,9 @@ main_body: br i1 %cmp, label %IF, label %ELSE IF: - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex.1 = bitcast <4 x float> %tex to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %c.bc = bitcast i32 %c to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %data.if = extractelement <4 x float> %dtex, i32 0 br label %END @@ -141,9 +140,9 @@ main_body: br i1 %cmp, label %ELSE, label %IF IF: - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex.1 = bitcast <4 x float> %tex to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %c.bc = bitcast i32 %c to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %data.if = extractelement <4 x float> %dtex, i32 0 br label %END @@ -198,7 +197,8 @@ ELSE: END: %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ] - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord.END, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %coord.END.bc = bitcast i32 %coord.END to float + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 ret <4 x float> %tex } @@ -213,13 +213,11 @@ END: ;CHECK: image_sample ;CHECK: v_cmp ;CHECK: store -define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, i32 %coord) { +define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) { main_body: - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex.1 = bitcast <4 x float> %tex to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %dtex.1 = extractelement <4 x float> %dtex, i32 0 - call void @llvm.amdgcn.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0) %cc = fcmp ogt float %dtex.1, 0.0 @@ -252,7 +250,7 @@ END: ;CHECK: %END ;CHECK: image_sample ;CHECK: image_sample -define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %coord, i32 %y, float %z) { +define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) { main_body: %cond = icmp eq i32 %y, 0 br i1 %cond, label %IF, label %END @@ -263,9 +261,8 @@ IF: br label %END END: - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex.1 = bitcast <4 x float> %tex to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 ret <4 x float> %dtex } @@ -284,10 +281,9 @@ END: ;CHECK: buffer_store_dword ;CHECK: s_mov_b64 exec, [[SAVE]] ;CHECK: image_sample -define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) { +define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) { main_body: - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %idx.0 = extractelement <2 x i32> %idx, i32 0 %data.0 = extractelement <2 x float> %data, i32 0 call void @llvm.amdgcn.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i1 0, i1 0) @@ -297,10 +293,8 @@ main_body: %idx.1 = extractelement <2 x i32> %idx, i32 1 %data.1 = extractelement <2 x float> %data, i32 1 call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0) - - %tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex2.1 = bitcast <4 x float> %tex2 to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex2.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %out = fadd <4 x float> %tex, %dtex ret <4 x float> %out @@ -318,11 +312,10 @@ main_body: ; CHECK: buffer_store_dword ; CHECK-NOT: wqm ; CHECK: v_cmpx_ -define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) { +define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { main_body: - %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex.1 = bitcast <4 x float> %tex to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) @@ -373,8 +366,7 @@ loop: br i1 %cc, label %break, label %body body: - %c.i = bitcast <4 x float> %c.iv to <4 x i32> - %c.next = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %c.next = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %c.iv, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %ctr.next = fadd float %ctr.iv, 2.0 br label %loop @@ -414,9 +406,8 @@ entry: %c.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 %idx %c = load i32, i32* %c.gep, align 4 - - %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - + %c.bc = bitcast i32 %c to float + %t = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0) ret void @@ -434,9 +425,8 @@ entry: ; CHECK: s_and_b64 exec, exec, [[LIVE]] ; CHECK-NOT: exec define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind { - %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex.i = bitcast <4 x float> %tex to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 ret <4 x float> %dtex } @@ -448,10 +438,8 @@ define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind { ; CHECK-NOT: exec define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind { entry: - %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - %tex.i = bitcast <4 x float> %tex to <4 x i32> - %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) - + %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 + %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %cc = icmp sgt i32 %c, 0 br i1 %cc, label %if, label %else @@ -483,33 +471,28 @@ main_body: br i1 %cc, label %if, label %else if: - %r.if = call <4 x float> @llvm.SI.image.sample.i32(i32 0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r.if = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float 0.0, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 br label %end else: - %r.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 0, i32 1>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %r.else = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 0.0, float bitcast (i32 1 to float)>, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 br label %end end: %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ] - call void @llvm.amdgcn.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0) - ret <4 x float> %r } declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1 declare void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 -declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1 -declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1 - -declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 -declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2 - -declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3 -declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3 -declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3 - +declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #2 +declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #3 +declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #3 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3 +declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3 declare void @llvm.AMDGPU.kill(float) #1 attributes #1 = { nounwind } |