| author | Piotr Sobczak <piotr.sobczak@amd.com> | 2019-10-16 11:14:01 +0000 |
|---|---|---|
| committer | Piotr Sobczak <piotr.sobczak@amd.com> | 2019-10-16 11:14:01 +0000 |
| commit | 79769a4475b37fa011203c688985e36b972328ff | |
| tree | 61e4e4ae5312b3bf27160595a33ad663018f2c7c | /llvm |
| parent | 5a131889665fadca1306e75cd463f0da320d6717 | |
[InstCombine][AMDGPU] Fix crash with v3i16/v3f16 buffer intrinsics
Summary:
This is something of a workaround: skip the demanded-vector-elements shortening for v3i16/v3f16 buffer intrinsic results to avoid a crash later on in the type legalizer (WidenVectorResult()).
Also added some f16 tests, including a non-working v3f16 case marked with a FIXME.
Reviewers: arsenm, tpr, nhaehnle
Reviewed By: arsenm
Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D68865
llvm-svn: 374993
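The guard added by this patch (the first hunk below) bails out of simplifyAMDGCNMemoryIntrinsicDemanded() before any shortening is attempted whenever the rewrite would produce a three-element vector of 16-bit elements. The following standalone sketch restates that condition with the reasoning spelled out in comments; the helper name is hypothetical, and only the three checks themselves come from the patch.

```cpp
// Sketch: the early-out condition from the patch, lifted into a standalone
// helper (the function name is hypothetical; the checks mirror the hunk below).
#include "llvm/ADT/APInt.h"
#include "llvm/IR/IntrinsicInst.h"

using namespace llvm;

static bool skipBufferShortening(const IntrinsicInst *II,
                                 const APInt &DemandedElts, int DMaskIdx) {
  // DMaskIdx < 0: this is a buffer intrinsic; image intrinsics carry a dmask
  // operand (DMaskIdx >= 0) and take a different path through the function.
  // getScalarSizeInBits() != 32: the result elements are 16-bit (i16/half),
  // so a shortened load could become v3i16/v3f16.
  // getActiveBits() == 3: the highest demanded lane is lane 2, so the
  // shortened load would have exactly three elements.
  return DMaskIdx < 0 && II->getType()->getScalarSizeInBits() != 32 &&
         DemandedElts.getActiveBits() == 3;
}
```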
Diffstat (limited to 'llvm')
| -rw-r--r-- | llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp | 7 |
| -rw-r--r-- | llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll | 45 |
2 files changed, 52 insertions, 0 deletions
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index ee42c730b09..4680cb30060 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -971,6 +971,13 @@ InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1,
 Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
                                                            APInt DemandedElts,
                                                            int DMaskIdx) {
+
+  // FIXME: Allow v3i16/v3f16 in buffer intrinsics when the types are fully supported.
+  if (DMaskIdx < 0 &&
+      II->getType()->getScalarSizeInBits() != 32 &&
+      DemandedElts.getActiveBits() == 3)
+    return nullptr;
+
   unsigned VWidth = II->getType()->getVectorNumElements();
   if (VWidth == 1)
     return nullptr;
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
index 906f2aff93d..db7533a3723 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
@@ -1474,6 +1474,51 @@ declare <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32
 declare <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #1

+; CHECK-LABEL: @extract_elt3_raw_tbuffer_load_v4f16(
+; CHECK-NEXT: %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
+; CHECK-NEXT: %elt1 = extractelement <4 x half> %data, i32 3
+; CHECK-NEXT: ret half %elt1
+define amdgpu_ps half @extract_elt3_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+  %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
+  %elt1 = extractelement <4 x half> %data, i32 3
+  ret half %elt1
+}
+
+; FIXME: Enable load shortening when full support for v3f16 has been added (should expect call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16).
+; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f16(
+; CHECK-NEXT: %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
+; CHECK-NEXT: %elt1 = extractelement <4 x half> %data, i32 2
+; CHECK-NEXT: ret half %elt1
+define amdgpu_ps half @extract_elt2_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+  %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
+  %elt1 = extractelement <4 x half> %data, i32 2
+  ret half %elt1
+}
+
+; CHECK-LABEL: @extract_elt1_raw_tbuffer_load_v4f16(
+; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
+; CHECK-NEXT: %elt1 = extractelement <2 x half> %data, i32 1
+; CHECK-NEXT: ret half %elt1
+define amdgpu_ps half @extract_elt1_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+  %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
+  %elt1 = extractelement <4 x half> %data, i32 1
+  ret half %elt1
+}
+
+; CHECK-LABEL: @extract_elt0_raw_tbuffer_load_v4f16(
+; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
+; CHECK-NEXT: ret half %data
+define amdgpu_ps half @extract_elt0_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
+  %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
+  %elt1 = extractelement <4 x half> %data, i32 0
+  ret half %elt1
+}
+
+declare half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32) #1
+declare <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32) #1
+declare <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32>, i32, i32, i32, i32) #1
+declare <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32) #1
+
 ; --------------------------------------------------------------------
 ; llvm.amdgcn.struct.tbuffer.load
 ; --------------------------------------------------------------------

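For buffer loads this simplification only trims trailing lanes (it keeps a leading prefix of elements), so the width of the shortened load is the index of the highest demanded lane plus one, which is what APInt::getActiveBits() returns for the demanded-elements mask. The snippet below is a hypothetical driver, not part of the patch; it only assumes the LLVM ADT headers and illustrates the values the guard keys on for a 4-lane vector.

```cpp
// Hypothetical driver showing APInt::getActiveBits() on 4-lane demanded masks;
// not part of the patch, just an illustration of the guard's third condition.
#include "llvm/ADT/APInt.h"
#include <cstdio>

int main() {
  using llvm::APInt;
  // Lanes {0,1}: highest demanded lane is 1 -> a 2-element load, still shortened.
  std::printf("%u\n", APInt(4, 0b0011).getActiveBits()); // 2
  // Lanes {0,1,2}: highest demanded lane is 2 -> would become a v3 load,
  // the case the patch now skips for 16-bit element types.
  std::printf("%u\n", APInt(4, 0b0111).getActiveBits()); // 3
  // Lanes {0,2}: highest demanded lane is still 2, so this also hits the guard.
  std::printf("%u\n", APInt(4, 0b0101).getActiveBits()); // 3
  // Lane 3 demanded: the load keeps all four elements; the guard does not apply.
  std::printf("%u\n", APInt(4, 0b1000).getActiveBits()); // 4
  return 0;
}
```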

