summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPiotr Sobczak <piotr.sobczak@amd.com>2019-10-16 11:14:01 +0000
committerPiotr Sobczak <piotr.sobczak@amd.com>2019-10-16 11:14:01 +0000
commit79769a4475b37fa011203c688985e36b972328ff (patch)
tree61e4e4ae5312b3bf27160595a33ad663018f2c7c
parent5a131889665fadca1306e75cd463f0da320d6717 (diff)
downloadbcm5719-llvm-79769a4475b37fa011203c688985e36b972328ff.tar.gz
bcm5719-llvm-79769a4475b37fa011203c688985e36b972328ff.zip
[InstCombine][AMDGPU] Fix crash with v3i16/v3f16 buffer intrinsics
Summary: This is something of a workaround to avoid a crash later on in type legalizer (WidenVectorResult()). Also added some f16 tests, including a non-working v3f16 case with a FIXME. Reviewers: arsenm, tpr, nhaehnle Reviewed By: arsenm Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D68865 llvm-svn: 374993
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp7
-rw-r--r--llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll45
2 files changed, 52 insertions, 0 deletions
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index ee42c730b09..4680cb30060 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -971,6 +971,13 @@ InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1,
Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
APInt DemandedElts,
int DMaskIdx) {
+
+ // FIXME: Allow v3i16/v3f16 in buffer intrinsics when the types are fully supported.
+ if (DMaskIdx < 0 &&
+ II->getType()->getScalarSizeInBits() != 32 &&
+ DemandedElts.getActiveBits() == 3)
+ return nullptr;
+
unsigned VWidth = II->getType()->getVectorNumElements();
if (VWidth == 1)
return nullptr;
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
index 906f2aff93d..db7533a3723 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
@@ -1474,6 +1474,51 @@ declare <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32
declare <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #1
+; CHECK-LABEL: @extract_elt3_raw_tbuffer_load_v4f16(
+; CHECK-NEXT: %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
+; CHECK-NEXT: %elt1 = extractelement <4 x half> %data, i32 3
+; CHECK-NEXT: ret half %elt1
+define amdgpu_ps half @extract_elt3_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+ %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
+ %elt1 = extractelement <4 x half> %data, i32 3
+ ret half %elt1
+}
+
+; FIXME: Enable load shortening when full support for v3f16 has been added (should expect call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16).
+; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f16(
+; CHECK-NEXT: %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
+; CHECK-NEXT: %elt1 = extractelement <4 x half> %data, i32 2
+; CHECK-NEXT: ret half %elt1
+define amdgpu_ps half @extract_elt2_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+ %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
+ %elt1 = extractelement <4 x half> %data, i32 2
+ ret half %elt1
+}
+
+; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v4f16(
+; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
+; CHECK-NEXT: %elt1 = extractelement <2 x half> %data, i32 1
+; CHECK-NEXT: ret half %elt1
+define amdgpu_ps half @extract_elt1_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+ %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
+ %elt1 = extractelement <4 x half> %data, i32 1
+ ret half %elt1
+}
+
+; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v4f16(
+; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
+; CHECK-NEXT: ret half %data
+define amdgpu_ps half @extract_elt0_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+ %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
+ %elt1 = extractelement <4 x half> %data, i32 0
+ ret half %elt1
+}
+
+declare half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32) #1
+declare <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32) #1
+declare <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32>, i32, i32, i32, i32) #1
+declare <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32) #1
+
; --------------------------------------------------------------------
; llvm.amdgcn.struct.tbuffer.load
; --------------------------------------------------------------------
OpenPOWER on IntegriCloud