From b29ee701229ba46c160e365234c214c2d59b7702 Mon Sep 17 00:00:00 2001
From: Nicolai Haehnle
Date: Thu, 21 Jun 2018 13:37:31 +0000
Subject: InstCombine/AMDGPU: Add dimension-aware image intrinsics to SimplifyDemanded

Summary:
Use the expanded features of the TableGen generic tables to avoid manually
adding the combinatorially exploded set of intrinsics. The
getAMDGPUImageDMaskIntrinsic lookup function is early-out, i.e. non-AMDGPU
intrinsics will never look at the underlying table.

Use a generic approach for getting the new intrinsic overload to keep the
code simple, and make the image dmask handling more generic:

- handle non-sampler image loads
- handle the case where the set of demanded elements is not a prefix

There is some overlap between this code and an optimization that happens
in the backend during code generation. They currently complement each other:

- only the codegen optimization can generate vec3 loads
- only the InstCombine optimization can handle D16

The InstCombine optimization also likely covers more cases since the
codegen optimization is fairly ad-hoc. Ideally, we'll remove the
optimization in codegen once the infrastructure for vec3 is in place
(which will probably take a long time).

Modify the test cases to use dimension-aware intrinsics. This makes it
easier to see that the test coverage for the new intrinsics is equivalent,
and the old style intrinsics will be removed in a follow-up commit anyway.

Change-Id: I4b91ea661413d13004956fe4ef7d13d41b8ce3ad Reviewers: arsenm, rampitec, majnemer Subscribers: kzhuravl, wdng, mgorny, yaxunl, dstuttard, tpr, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D48165 llvm-svn: 335230 --- llvm/lib/Transforms/InstCombine/CMakeLists.txt | 4 + .../Transforms/InstCombine/InstCombineInternal.h | 4 + .../InstCombine/InstCombineSimplifyDemanded.cpp | 193 +++++++++++++-------- .../Transforms/InstCombine/InstCombineTables.td | 11 ++ 4 files changed, 141 insertions(+), 71 deletions(-) create mode 100644 llvm/lib/Transforms/InstCombine/InstCombineTables.td (limited to 'llvm/lib/Transforms/InstCombine') diff --git a/llvm/lib/Transforms/InstCombine/CMakeLists.txt b/llvm/lib/Transforms/InstCombine/CMakeLists.txt index 5cbe804ce3e..8a3a58e9ecc 100644 --- a/llvm/lib/Transforms/InstCombine/CMakeLists.txt +++ b/llvm/lib/Transforms/InstCombine/CMakeLists.txt @@ -1,3 +1,7 @@ +set(LLVM_TARGET_DEFINITIONS InstCombineTables.td) +tablegen(LLVM InstCombineTables.inc -gen-searchable-tables) +add_public_tablegen_target(InstCombineTableGen) + add_llvm_library(LLVMInstCombine InstructionCombining.cpp InstCombineAddSub.cpp diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index ba99035f764..23888526aae 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -706,6 +706,10 @@ private: /// demanded bits. 
bool SimplifyDemandedInstructionBits(Instruction &Inst); + Value *simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, + APInt DemandedElts, + int DmaskIdx = -1); + Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, APInt &UndefElts, unsigned Depth = 0); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 8190c342c29..133561b30b4 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -23,6 +23,17 @@ using namespace llvm::PatternMatch; #define DEBUG_TYPE "instcombine" +namespace { + +struct AMDGPUImageDMaskIntrinsic { + unsigned Intr; +}; + +#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL +#include "InstCombineTables.inc" + +} // end anonymous namespace + /// Check to see if the specified operand of the specified instruction is a /// constant integer. If so, check to see if there are any bits set in the /// constant that are not demanded. If so, shrink the constant and return true. @@ -909,6 +920,110 @@ InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1, return nullptr; } +/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics. +Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, + APInt DemandedElts, + int DMaskIdx) { + unsigned VWidth = II->getType()->getVectorNumElements(); + if (VWidth == 1) + return nullptr; + + ConstantInt *NewDMask = nullptr; + + if (DMaskIdx < 0) { + // Pretend that a prefix of elements is demanded to simplify the code + // below. 
+ DemandedElts = (1 << DemandedElts.getActiveBits()) - 1; + } else { + ConstantInt *DMask = dyn_cast(II->getArgOperand(DMaskIdx)); + if (!DMask) + return nullptr; // non-constant dmask is not supported by codegen + + unsigned DMaskVal = DMask->getZExtValue() & 0xf; + + // Mask off values that are undefined because the dmask doesn't cover them + DemandedElts &= (1 << countPopulation(DMaskVal)) - 1; + + unsigned NewDMaskVal = 0; + unsigned OrigLoadIdx = 0; + for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) { + const unsigned Bit = 1 << SrcIdx; + if (!!(DMaskVal & Bit)) { + if (!!(DemandedElts & (1 << OrigLoadIdx))) + NewDMaskVal |= Bit; + OrigLoadIdx++; + } + } + + if (DMaskVal != NewDMaskVal) + NewDMask = ConstantInt::get(DMask->getType(), NewDMaskVal); + } + + // TODO: Handle 3 vectors when supported in code gen. + unsigned NewNumElts = PowerOf2Ceil(DemandedElts.countPopulation()); + if (!NewNumElts) + return UndefValue::get(II->getType()); + + if (NewNumElts >= VWidth && DemandedElts.isMask()) { + if (NewDMask) + II->setArgOperand(DMaskIdx, NewDMask); + return nullptr; + } + + // Determine the overload types of the original intrinsic. + auto IID = II->getIntrinsicID(); + SmallVector Table; + getIntrinsicInfoTableEntries(IID, Table); + ArrayRef TableRef = Table; + + FunctionType *FTy = II->getCalledFunction()->getFunctionType(); + SmallVector OverloadTys; + Intrinsic::matchIntrinsicType(FTy->getReturnType(), TableRef, OverloadTys); + for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) + Intrinsic::matchIntrinsicType(FTy->getParamType(i), TableRef, OverloadTys); + + // Get the new return type overload of the intrinsic. + Module *M = II->getParent()->getParent()->getParent(); + Type *EltTy = II->getType()->getVectorElementType(); + Type *NewTy = (NewNumElts == 1) ? 
EltTy : VectorType::get(EltTy, NewNumElts); + + OverloadTys[0] = NewTy; + Function *NewIntrin = Intrinsic::getDeclaration(M, IID, OverloadTys); + + SmallVector Args; + for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I) + Args.push_back(II->getArgOperand(I)); + + if (NewDMask) + Args[DMaskIdx] = NewDMask; + + IRBuilderBase::InsertPointGuard Guard(Builder); + Builder.SetInsertPoint(II); + + CallInst *NewCall = Builder.CreateCall(NewIntrin, Args); + NewCall->takeName(II); + NewCall->copyMetadata(*II); + + if (NewNumElts == 1) { + return Builder.CreateInsertElement(UndefValue::get(II->getType()), NewCall, + DemandedElts.countTrailingZeros()); + } + + SmallVector EltMask; + unsigned NewLoadIdx = 0; + for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) { + if (!!(DemandedElts & (1 << OrigLoadIdx))) + EltMask.push_back(NewLoadIdx++); + else + EltMask.push_back(NewNumElts); + } + + Value *Shuffle = + Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask); + + return Shuffle; +} + /// The specified value produces a vector with any number of elements. /// DemandedElts contains the set of elements that are actually used by the /// caller. This method analyzes which elements of the operand are undef and @@ -1267,8 +1382,6 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, IntrinsicInst *II = dyn_cast(I); if (!II) break; switch (II->getIntrinsicID()) { - default: break; - case Intrinsic::x86_xop_vfrcz_ss: case Intrinsic::x86_xop_vfrcz_sd: // The instructions for these intrinsics are speced to zero upper bits not @@ -1582,79 +1695,17 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, case Intrinsic::amdgcn_image_sample_c_cd_cl_o: case Intrinsic::amdgcn_image_getlod: { - if (VWidth == 1 || !DemandedElts.isMask()) - return nullptr; - - // TODO: Handle 3 vectors when supported in code gen. 
- unsigned NewNumElts = PowerOf2Ceil(DemandedElts.countTrailingOnes()); - if (NewNumElts == VWidth) - return nullptr; - - Module *M = II->getParent()->getParent()->getParent(); - Type *EltTy = V->getType()->getVectorElementType(); - - Type *NewTy = (NewNumElts == 1) ? EltTy : - VectorType::get(EltTy, NewNumElts); - auto IID = II->getIntrinsicID(); - bool IsBuffer = IID == Intrinsic::amdgcn_buffer_load || IID == Intrinsic::amdgcn_buffer_load_format; + return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts, + IsBuffer ? -1 : 3); + } + default: { + if (getAMDGPUImageDMaskIntrinsic(II->getIntrinsicID())) + return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts, 0); - Function *NewIntrin = IsBuffer ? - Intrinsic::getDeclaration(M, IID, NewTy) : - // Samplers have 3 mangled types. - Intrinsic::getDeclaration(M, IID, - { NewTy, II->getArgOperand(0)->getType(), - II->getArgOperand(1)->getType()}); - - SmallVector Args; - for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I) - Args.push_back(II->getArgOperand(I)); - - IRBuilderBase::InsertPointGuard Guard(Builder); - Builder.SetInsertPoint(II); - - CallInst *NewCall = Builder.CreateCall(NewIntrin, Args); - NewCall->takeName(II); - NewCall->copyMetadata(*II); - - if (!IsBuffer) { - ConstantInt *DMask = dyn_cast(NewCall->getArgOperand(3)); - if (DMask) { - unsigned DMaskVal = DMask->getZExtValue() & 0xf; - - unsigned PopCnt = 0; - unsigned NewDMask = 0; - for (unsigned I = 0; I < 4; ++I) { - const unsigned Bit = 1 << I; - if (!!(DMaskVal & Bit)) { - if (++PopCnt > NewNumElts) - break; - - NewDMask |= Bit; - } - } - - NewCall->setArgOperand(3, ConstantInt::get(DMask->getType(), NewDMask)); - } - } - - - if (NewNumElts == 1) { - return Builder.CreateInsertElement(UndefValue::get(V->getType()), - NewCall, static_cast(0)); - } - - SmallVector EltMask; - for (unsigned I = 0; I < VWidth; ++I) - EltMask.push_back(I); - - Value *Shuffle = Builder.CreateShuffleVector( - NewCall, UndefValue::get(NewTy), EltMask); 
- - MadeChange = true; - return Shuffle; + break; } } break; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineTables.td b/llvm/lib/Transforms/InstCombine/InstCombineTables.td new file mode 100644 index 00000000000..98b2adc442f --- /dev/null +++ b/llvm/lib/Transforms/InstCombine/InstCombineTables.td @@ -0,0 +1,11 @@ +include "llvm/TableGen/SearchableTable.td" +include "llvm/IR/Intrinsics.td" + +def AMDGPUImageDMaskIntrinsicTable : GenericTable { + let FilterClass = "AMDGPUImageDMaskIntrinsic"; + let Fields = ["Intr"]; + + let PrimaryKey = ["Intr"]; + let PrimaryKeyName = "getAMDGPUImageDMaskIntrinsic"; + let PrimaryKeyEarlyOut = 1; +} -- cgit v1.2.3