summaryrefslogtreecommitdiffstats
path: root/llvm/test
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test')
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-m0.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/wqm.ll64
2 files changed, 58 insertions, 14 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll
index 1147464c1dd..a7b522165ab 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-m0.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-m0.ll
@@ -66,13 +66,13 @@ endif:
; TOSMEM-NOT: s_m0
; TOSMEM: s_add_u32 m0, s7, 0x100
-; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
+; TOSMEM-NEXT: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it
; FIXME-TOSMEM-NOT: m0
; FIXME-TOSMEM-NOT: m0
-; TOSMEM: s_add_u32 m0, s7, 0x200
-; TOSMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
+; TOSMEM: s_add_u32 m0, s7, 0x300
+; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
; FIXME-TOSMEM-NOT: m0
; TOSMEM: s_mov_b64 exec,
@@ -80,7 +80,7 @@ endif:
; TOSMEM: s_branch
; TOSMEM: BB{{[0-9]+_[0-9]+}}:
-; TOSMEM: s_add_u32 m0, s7, 0x200
+; TOSMEM: s_add_u32 m0, s7, 0x400
; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 12fb6cd4d6f..e403ad128a0 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -12,24 +12,33 @@ main_body:
ret <4 x float> %tex
}
-; Check that WQM is triggered by image samples and left untouched for loads...
+; Check that WQM is triggered by code calculating inputs to image samples and is disabled as soon as possible
;
;CHECK-LABEL: {{^}}test2:
;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: interp
+;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK-NOT: interp
+;CHECK: image_sample
;CHECK-NOT: exec
-define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x float> %c) {
+;CHECK: .size test2
+define amdgpu_ps <4 x float> @test2(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
main_body:
- %c.1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
- %c.2 = bitcast <4 x float> %c.1 to <4 x i32>
- %c.3 = extractelement <4 x i32> %c.2, i32 0
- %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3
- %data = load float, float addrspace(1)* %gep
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %data, float undef, float undef, float undef, i1 true, i1 true) #1
- ret void
+ %inst23 = extractelement <2 x float> %pos, i32 0
+ %inst24 = extractelement <2 x float> %pos, i32 1
+ %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
+ %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
+ %inst27 = insertelement <2 x float> undef, float %inst26, i32 0
+ %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
+ %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
+ %inst30 = insertelement <2 x float> %inst27, float %inst29, i32 1
+ %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %inst30, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
+ ret <4 x float> %tex
}
-; ... but disabled for stores (and, in this simple case, not re-enabled).
+; ... but disabled for stores (and, in this simple case, not re-enabled) ...
;
;CHECK-LABEL: {{^}}test3:
;CHECK-NEXT: ; %main_body
@@ -51,6 +60,36 @@ main_body:
ret <4 x float> %tex
}
+; ... and disabled for export.
+;
+;CHECK-LABEL: {{^}}test3x:
+;CHECK-NEXT: ; %main_body
+;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: image_sample
+;CHECK: exp
+;CHECK-NOT: exec
+;CHECK: .size test3x
+define amdgpu_ps void @test3x(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
+main_body:
+ %inst23 = extractelement <2 x float> %pos, i32 0
+ %inst24 = extractelement <2 x float> %pos, i32 1
+ %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
+ %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
+ %inst27 = insertelement <2 x float> undef, float %inst26, i32 0
+ %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
+ %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
+ %inst30 = insertelement <2 x float> %inst27, float %inst29, i32 1
+ %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %inst30, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
+ %tex.0 = extractelement <4 x float> %tex, i32 0
+ %tex.1 = extractelement <4 x float> %tex, i32 1
+ %tex.2 = extractelement <4 x float> %tex, i32 2
+ %tex.3 = extractelement <4 x float> %tex, i32 3
+ call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex.0, float %tex.1, float %tex.2, float %tex.3, i1 true, i1 true)
+ ret void
+}
+
; Check that WQM is re-enabled when required.
;
;CHECK-LABEL: {{^}}test4:
@@ -724,9 +763,14 @@ declare i32 @llvm.amdgcn.wwm.i32(i32) #3
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #1
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
attributes #3 = { nounwind readnone }
attributes #4 = { nounwind readnone convergent }
attributes #5 = { "amdgpu-ps-wqm-outputs" }
+attributes #6 = { nounwind "InitialPSInputAddr"="2" }
OpenPOWER on IntegriCloud