From 65f94b33808d7d69539961a6f5a2168f0a1eef41 Mon Sep 17 00:00:00 2001 From: Piotr Sobczak Date: Fri, 15 Nov 2019 15:07:41 +0100 Subject: [InstCombine][AMDGPU] Trim more components of *buffer_load Summary: Add trimming of unused components of s_buffer_load. Extend trimming of *buffer_load to also include unused components at the beginning of vectors and update offset. Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D70315 --- .../InstCombine/InstCombineSimplifyDemanded.cpp | 77 +++++++++++++++++----- 1 file changed, 60 insertions(+), 17 deletions(-) (limited to 'llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp') diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 359fc55035f..4e509816716 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -984,13 +984,65 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, if (VWidth == 1) return nullptr; - ConstantInt *NewDMask = nullptr; + IRBuilderBase::InsertPointGuard Guard(Builder); + Builder.SetInsertPoint(II); + + // Assume the arguments are unchanged and later override them, if needed. + SmallVector<Value *, 16> Args(II->arg_begin(), II->arg_end()); if (DMaskIdx < 0) { - // Pretend that a prefix of elements is demanded to simplify the code - // below. - DemandedElts = (1 << DemandedElts.getActiveBits()) - 1; + // Buffer case. + + const unsigned ActiveBits = DemandedElts.getActiveBits(); + const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros(); + + // Start assuming the prefix of elements is demanded, but possibly clear some other bits if + // there are trailing zeros (unused components at front) and update offset. 
+ DemandedElts = (1 << ActiveBits) - 1; + + if (UnusedComponentsAtFront > 0) { + static const unsigned InvalidOffsetIdx = 0xf; + + unsigned OffsetIdx; + switch (II->getIntrinsicID()) { + case Intrinsic::amdgcn_raw_buffer_load: + case Intrinsic::amdgcn_raw_buffer_load_format: + OffsetIdx = 1; + break; + case Intrinsic::amdgcn_s_buffer_load: + // If resulting type is vec3, there is no point in trimming the + // load with updated offset, as the vec3 would most likely be widened to + // vec4 anyway during lowering. + if (ActiveBits == 4 && UnusedComponentsAtFront == 1) + OffsetIdx = InvalidOffsetIdx; + else + OffsetIdx = 1; + break; + case Intrinsic::amdgcn_buffer_load: + case Intrinsic::amdgcn_buffer_load_format: + case Intrinsic::amdgcn_struct_buffer_load: + case Intrinsic::amdgcn_struct_buffer_load_format: + OffsetIdx = 2; + break; + default: + // TODO: handle *tbuffer* intrinsics. + OffsetIdx = InvalidOffsetIdx; + break; + } + + if (OffsetIdx != InvalidOffsetIdx) { + // Clear demanded bits and update the offset. + DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1); + auto Offset = II->getArgOperand(OffsetIdx); + unsigned SingleComponentSizeInBits = getDataLayout().getTypeSizeInBits(II->getType()->getScalarType()); + unsigned OffsetAdd = UnusedComponentsAtFront * SingleComponentSizeInBits / 8; + auto OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd); + Args[OffsetIdx] = Builder.CreateAdd(Offset, OffsetAddVal); + } + } } else { + // Image case. 
+ ConstantInt *DMask = cast<ConstantInt>(II->getArgOperand(DMaskIdx)); unsigned DMaskVal = DMask->getZExtValue() & 0xf; @@ -1009,7 +1061,7 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, } if (DMaskVal != NewDMaskVal) - NewDMask = ConstantInt::get(DMask->getType(), NewDMaskVal); + Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal); } unsigned NewNumElts = DemandedElts.countPopulation(); @@ -1017,8 +1069,8 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, return UndefValue::get(II->getType()); if (NewNumElts >= VWidth && DemandedElts.isMask()) { - if (NewDMask) - II->setArgOperand(DMaskIdx, NewDMask); + if (DMaskIdx >= 0) + II->setArgOperand(DMaskIdx, Args[DMaskIdx]); return nullptr; } @@ -1041,16 +1093,6 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, OverloadTys[0] = NewTy; Function *NewIntrin = Intrinsic::getDeclaration(M, IID, OverloadTys); - SmallVector<Value *, 16> Args; - for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I) - Args.push_back(II->getArgOperand(I)); - - if (NewDMask) - Args[DMaskIdx] = NewDMask; - - IRBuilderBase::InsertPointGuard Guard(Builder); - Builder.SetInsertPoint(II); - CallInst *NewCall = Builder.CreateCall(NewIntrin, Args); NewCall->takeName(II); NewCall->copyMetadata(*II); @@ -1719,6 +1761,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_buffer_load_format: case Intrinsic::amdgcn_raw_tbuffer_load: + case Intrinsic::amdgcn_s_buffer_load: case Intrinsic::amdgcn_struct_buffer_load: case Intrinsic::amdgcn_struct_buffer_load_format: case Intrinsic::amdgcn_struct_tbuffer_load: -- cgit v1.2.3