author     Nicolai Haehnle <nhaehnle@gmail.com>      2018-06-21 13:37:31 +0000
committer  Nicolai Haehnle <nhaehnle@gmail.com>      2018-06-21 13:37:31 +0000
commit     b29ee701229ba46c160e365234c214c2d59b7702 (patch)
tree       a10283a83061707c7c64cd5fb0d8fe8deebcb4e8 /llvm/lib
parent     1045928aab8d33bcef57e004ebdd5482468922e6 (diff)
InstCombine/AMDGPU: Add dimension-aware image intrinsics to SimplifyDemanded
Summary:
Use the expanded features of the TableGen generic tables to avoid manually
adding the combinatorially exploded set of intrinsics. The
getAMDGPUImageDMaskIntrinsic lookup function (generated from the new
InstCombineTables.td) is early-out, i.e. non-AMDGPU intrinsics never look at
the underlying table.
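For concreteness: with PrimaryKeyEarlyOut, the searchable-tables backend emits
a range check on the sorted primary key before the actual table search. The
following self-contained C++ sketch is hand-written, with made-up intrinsic IDs
standing in for the TableGen'd rows; only the shape and the early-out behaviour
mirror the generated InstCombineTables.inc.

#include <algorithm>
#include <cassert>
#include <iterator>

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

// Stand-in for the TableGen'd, key-sorted table; the real rows are emitted
// into InstCombineTables.inc from records matching the FilterClass.
static const AMDGPUImageDMaskIntrinsic AMDGPUImageDMaskIntrinsicTable[] = {
    {1001}, {1002}, {1005}, // made-up intrinsic IDs
};

const AMDGPUImageDMaskIntrinsic *getAMDGPUImageDMaskIntrinsic(unsigned Intr) {
  const auto *B = std::begin(AMDGPUImageDMaskIntrinsicTable);
  const auto *E = std::end(AMDGPUImageDMaskIntrinsicTable);
  // PrimaryKeyEarlyOut = 1: keys outside the sorted range (i.e. every
  // non-AMDGPU intrinsic ID) return before any search of the table.
  if (Intr < B->Intr || Intr > (E - 1)->Intr)
    return nullptr;
  // Binary search on the primary key.
  const auto *I = std::lower_bound(
      B, E, Intr, [](const AMDGPUImageDMaskIntrinsic &Row, unsigned Key) {
        return Row.Intr < Key;
      });
  return (I != E && I->Intr == Intr) ? I : nullptr;
}

int main() {
  assert(getAMDGPUImageDMaskIntrinsic(1002) != nullptr);
  assert(getAMDGPUImageDMaskIntrinsic(7) == nullptr);    // early-out, below range
  assert(getAMDGPUImageDMaskIntrinsic(1003) == nullptr); // in range, not a row
  return 0;
}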
Use a generic approach for getting the new intrinsic overload to keep the
code simple, and make the image dmask handling more generic (see the C++
sketch after this list):
- handle non-sampler image loads
- handle the case where the set of demanded elements is not a prefix
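To make the non-prefix case concrete, here is a self-contained C++ sketch of
the dmask-narrowing loop from the new simplifyAMDGCNMemoryIntrinsicDemanded
helper in the diff below (same logic, lifted out of its LLVM surroundings; the
dmask/demanded values in main are made-up examples):

#include <cassert>

// Each set bit of the old dmask corresponds to one loaded vector element,
// in order. A channel bit survives only if its element is actually demanded,
// so both non-prefix demanded sets and sparse dmasks are handled.
unsigned narrowDMask(unsigned DMaskVal, unsigned DemandedElts) {
  unsigned NewDMaskVal = 0;
  unsigned OrigLoadIdx = 0;
  for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
    const unsigned Bit = 1u << SrcIdx;
    if (DMaskVal & Bit) {
      if (DemandedElts & (1u << OrigLoadIdx))
        NewDMaskVal |= Bit;
      ++OrigLoadIdx;
    }
  }
  return NewDMaskVal;
}

int main() {
  // Non-prefix: dmask 0xf loads x,y,z,w; only element 2 (z) is demanded,
  // so only channel bit 2 survives.
  assert(narrowDMask(0xf, 0b0100) == 0b0100);
  // Sparse dmask: 0b1010 loads channels y,w as elements 0,1; demanding only
  // element 1 keeps only the w bit.
  assert(narrowDMask(0b1010, 0b0010) == 0b1000);
  return 0;
}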
There is some overlap between this code and an optimization that happens
in the backend during code generation. They currently complement each other:
- only the codegen optimization can generate vec3 loads (the InstCombine
helper rounds the narrowed element count up to a power of two, per the TODO
in the patch)
- only the InstCombine optimization can handle D16
The InstCombine optimization also likely covers more cases, since the
codegen optimization is fairly ad hoc. Ideally, we'll remove the codegen
optimization once the infrastructure for vec3 loads is in place (which will
probably take a long time).
Modify the test cases to use dimension-aware intrinsics. This makes it
easier to see that the test coverage for the new intrinsics is equivalent;
the old-style intrinsics will be removed in a follow-up commit anyway.
Change-Id: I4b91ea661413d13004956fe4ef7d13d41b8ce3ad
Reviewers: arsenm, rampitec, majnemer
Subscribers: kzhuravl, wdng, mgorny, yaxunl, dstuttard, tpr, t-tye, llvm-commits
Differential Revision: https://reviews.llvm.org/D48165
llvm-svn: 335230
Diffstat (limited to 'llvm/lib')
4 files changed, 141 insertions, 71 deletions
diff --git a/llvm/lib/Transforms/InstCombine/CMakeLists.txt b/llvm/lib/Transforms/InstCombine/CMakeLists.txt
index 5cbe804ce3e..8a3a58e9ecc 100644
--- a/llvm/lib/Transforms/InstCombine/CMakeLists.txt
+++ b/llvm/lib/Transforms/InstCombine/CMakeLists.txt
@@ -1,3 +1,7 @@
+set(LLVM_TARGET_DEFINITIONS InstCombineTables.td)
+tablegen(LLVM InstCombineTables.inc -gen-searchable-tables)
+add_public_tablegen_target(InstCombineTableGen)
+
 add_llvm_library(LLVMInstCombine
   InstructionCombining.cpp
   InstCombineAddSub.cpp
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index ba99035f764..23888526aae 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -706,6 +706,10 @@ private:
   /// demanded bits.
   bool SimplifyDemandedInstructionBits(Instruction &Inst);
 
+  Value *simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
+                                               APInt DemandedElts,
+                                               int DmaskIdx = -1);
+
   Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
                                     APInt &UndefElts, unsigned Depth = 0);
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 8190c342c29..133561b30b4 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -23,6 +23,17 @@ using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "instcombine"
 
+namespace {
+
+struct AMDGPUImageDMaskIntrinsic {
+  unsigned Intr;
+};
+
+#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
+#include "InstCombineTables.inc"
+
+} // end anonymous namespace
+
 /// Check to see if the specified operand of the specified instruction is a
 /// constant integer. If so, check to see if there are any bits set in the
 /// constant that are not demanded. If so, shrink the constant and return true.
@@ -909,6 +920,110 @@ InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1,
   return nullptr;
 }
 
+/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
+Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
+                                                           APInt DemandedElts,
+                                                           int DMaskIdx) {
+  unsigned VWidth = II->getType()->getVectorNumElements();
+  if (VWidth == 1)
+    return nullptr;
+
+  ConstantInt *NewDMask = nullptr;
+
+  if (DMaskIdx < 0) {
+    // Pretend that a prefix of elements is demanded to simplify the code
+    // below.
+    DemandedElts = (1 << DemandedElts.getActiveBits()) - 1;
+  } else {
+    ConstantInt *DMask = dyn_cast<ConstantInt>(II->getArgOperand(DMaskIdx));
+    if (!DMask)
+      return nullptr; // non-constant dmask is not supported by codegen
+
+    unsigned DMaskVal = DMask->getZExtValue() & 0xf;
+
+    // Mask off values that are undefined because the dmask doesn't cover them
+    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;
+
+    unsigned NewDMaskVal = 0;
+    unsigned OrigLoadIdx = 0;
+    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
+      const unsigned Bit = 1 << SrcIdx;
+      if (!!(DMaskVal & Bit)) {
+        if (!!(DemandedElts & (1 << OrigLoadIdx)))
+          NewDMaskVal |= Bit;
+        OrigLoadIdx++;
+      }
+    }
+
+    if (DMaskVal != NewDMaskVal)
+      NewDMask = ConstantInt::get(DMask->getType(), NewDMaskVal);
+  }
+
+  // TODO: Handle 3 vectors when supported in code gen.
+  unsigned NewNumElts = PowerOf2Ceil(DemandedElts.countPopulation());
+  if (!NewNumElts)
+    return UndefValue::get(II->getType());
+
+  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
+    if (NewDMask)
+      II->setArgOperand(DMaskIdx, NewDMask);
+    return nullptr;
+  }
+
+  // Determine the overload types of the original intrinsic.
+  auto IID = II->getIntrinsicID();
+  SmallVector<Intrinsic::IITDescriptor, 16> Table;
+  getIntrinsicInfoTableEntries(IID, Table);
+  ArrayRef<Intrinsic::IITDescriptor> TableRef = Table;
+
+  FunctionType *FTy = II->getCalledFunction()->getFunctionType();
+  SmallVector<Type *, 6> OverloadTys;
+  Intrinsic::matchIntrinsicType(FTy->getReturnType(), TableRef, OverloadTys);
+  for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
+    Intrinsic::matchIntrinsicType(FTy->getParamType(i), TableRef, OverloadTys);
+
+  // Get the new return type overload of the intrinsic.
+  Module *M = II->getParent()->getParent()->getParent();
+  Type *EltTy = II->getType()->getVectorElementType();
+  Type *NewTy = (NewNumElts == 1) ? EltTy : VectorType::get(EltTy, NewNumElts);
+
+  OverloadTys[0] = NewTy;
+  Function *NewIntrin = Intrinsic::getDeclaration(M, IID, OverloadTys);
+
+  SmallVector<Value *, 16> Args;
+  for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I)
+    Args.push_back(II->getArgOperand(I));
+
+  if (NewDMask)
+    Args[DMaskIdx] = NewDMask;
+
+  IRBuilderBase::InsertPointGuard Guard(Builder);
+  Builder.SetInsertPoint(II);
+
+  CallInst *NewCall = Builder.CreateCall(NewIntrin, Args);
+  NewCall->takeName(II);
+  NewCall->copyMetadata(*II);
+
+  if (NewNumElts == 1) {
+    return Builder.CreateInsertElement(UndefValue::get(II->getType()), NewCall,
+                                       DemandedElts.countTrailingZeros());
+  }
+
+  SmallVector<uint32_t, 8> EltMask;
+  unsigned NewLoadIdx = 0;
+  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
+    if (!!(DemandedElts & (1 << OrigLoadIdx)))
+      EltMask.push_back(NewLoadIdx++);
+    else
+      EltMask.push_back(NewNumElts);
+  }
+
+  Value *Shuffle =
+      Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask);
+
+  return Shuffle;
+}
+
 /// The specified value produces a vector with any number of elements.
 /// DemandedElts contains the set of elements that are actually used by the
 /// caller. This method analyzes which elements of the operand are undef and
@@ -1267,8 +1382,6 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
     IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
     if (!II) break;
     switch (II->getIntrinsicID()) {
-    default: break;
-
     case Intrinsic::x86_xop_vfrcz_ss:
     case Intrinsic::x86_xop_vfrcz_sd:
       // The instructions for these intrinsics are speced to zero upper bits not
@@ -1582,79 +1695,17 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
     case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
     case Intrinsic::amdgcn_image_getlod: {
-      if (VWidth == 1 || !DemandedElts.isMask())
-        return nullptr;
-
-      // TODO: Handle 3 vectors when supported in code gen.
-      unsigned NewNumElts = PowerOf2Ceil(DemandedElts.countTrailingOnes());
-      if (NewNumElts == VWidth)
-        return nullptr;
-
-      Module *M = II->getParent()->getParent()->getParent();
-      Type *EltTy = V->getType()->getVectorElementType();
-
-      Type *NewTy = (NewNumElts == 1) ? EltTy :
-        VectorType::get(EltTy, NewNumElts);
-
       auto IID = II->getIntrinsicID();
-
       bool IsBuffer = IID == Intrinsic::amdgcn_buffer_load ||
                       IID == Intrinsic::amdgcn_buffer_load_format;
+      return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts,
+                                                   IsBuffer ? -1 : 3);
+    }
+    default: {
+      if (getAMDGPUImageDMaskIntrinsic(II->getIntrinsicID()))
+        return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts, 0);
 
-      Function *NewIntrin = IsBuffer ?
-        Intrinsic::getDeclaration(M, IID, NewTy) :
-        // Samplers have 3 mangled types.
-        Intrinsic::getDeclaration(M, IID,
-                                  { NewTy, II->getArgOperand(0)->getType(),
-                                    II->getArgOperand(1)->getType()});
-
-      SmallVector<Value *, 5> Args;
-      for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I)
-        Args.push_back(II->getArgOperand(I));
-
-      IRBuilderBase::InsertPointGuard Guard(Builder);
-      Builder.SetInsertPoint(II);
-
-      CallInst *NewCall = Builder.CreateCall(NewIntrin, Args);
-      NewCall->takeName(II);
-      NewCall->copyMetadata(*II);
-
-      if (!IsBuffer) {
-        ConstantInt *DMask = dyn_cast<ConstantInt>(NewCall->getArgOperand(3));
-        if (DMask) {
-          unsigned DMaskVal = DMask->getZExtValue() & 0xf;
-
-          unsigned PopCnt = 0;
-          unsigned NewDMask = 0;
-          for (unsigned I = 0; I < 4; ++I) {
-            const unsigned Bit = 1 << I;
-            if (!!(DMaskVal & Bit)) {
-              if (++PopCnt > NewNumElts)
-                break;
-
-              NewDMask |= Bit;
-            }
-          }
-
-          NewCall->setArgOperand(3, ConstantInt::get(DMask->getType(), NewDMask));
-        }
-      }
-
-      if (NewNumElts == 1) {
-        return Builder.CreateInsertElement(UndefValue::get(V->getType()),
-                                           NewCall, static_cast<uint64_t>(0));
-      }
-
-      SmallVector<uint32_t, 8> EltMask;
-      for (unsigned I = 0; I < VWidth; ++I)
-        EltMask.push_back(I);
-
-      Value *Shuffle = Builder.CreateShuffleVector(
-        NewCall, UndefValue::get(NewTy), EltMask);
-
-      MadeChange = true;
-      return Shuffle;
+      break;
     }
     }
     break;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineTables.td b/llvm/lib/Transforms/InstCombine/InstCombineTables.td
new file mode 100644
index 00000000000..98b2adc442f
--- /dev/null
+++ b/llvm/lib/Transforms/InstCombine/InstCombineTables.td
@@ -0,0 +1,11 @@
+include "llvm/TableGen/SearchableTable.td"
+include "llvm/IR/Intrinsics.td"
+
+def AMDGPUImageDMaskIntrinsicTable : GenericTable {
+  let FilterClass = "AMDGPUImageDMaskIntrinsic";
+  let Fields = ["Intr"];
+
+  let PrimaryKey = ["Intr"];
+  let PrimaryKeyName = "getAMDGPUImageDMaskIntrinsic";
+  let PrimaryKeyEarlyOut = 1;
+}
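A note on the shufflevector rebuild at the end of
simplifyAMDGCNMemoryIntrinsicDemanded above: demanded lanes are renumbered
densely into the narrowed load, while undemanded lanes index into the undef
operand of the shuffle. A self-contained C++ sketch of that mask construction
(std::vector standing in for SmallVector; the values in main are made-up):

#include <cassert>
#include <vector>

// Demanded lanes map to consecutive elements of the narrowed load;
// undemanded lanes get index NewNumElts, i.e. the first element of the
// undef second operand of the shufflevector.
std::vector<unsigned> buildEltMask(unsigned VWidth, unsigned DemandedElts,
                                   unsigned NewNumElts) {
  std::vector<unsigned> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (DemandedElts & (1u << OrigLoadIdx))
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts); // undef lane
  }
  return EltMask;
}

int main() {
  // A v4 result where lanes 0 and 2 are demanded narrows to a v2 load:
  // lane 0 -> new lane 0, lane 2 -> new lane 1, lanes 1 and 3 -> undef (2).
  assert(buildEltMask(4, 0b0101, 2) == (std::vector<unsigned>{0, 2, 1, 2}));
  return 0;
}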