author    Nicolai Haehnle <nhaehnle@gmail.com>    2018-06-21 13:37:31 +0000
committer Nicolai Haehnle <nhaehnle@gmail.com>    2018-06-21 13:37:31 +0000
commit    b29ee701229ba46c160e365234c214c2d59b7702 (patch)
tree      a10283a83061707c7c64cd5fb0d8fe8deebcb4e8 /llvm/lib
parent    1045928aab8d33bcef57e004ebdd5482468922e6 (diff)
InstCombine/AMDGPU: Add dimension-aware image intrinsics to SimplifyDemanded
Summary:
Use the expanded features of the TableGen generic tables to avoid manually
adding the combinatorially exploded set of intrinsics.

The getAMDGPUImageDimIntrinsic lookup function is early-out, i.e. non-AMDGPU
intrinsics will never look at the underlying table.

Use a generic approach for getting the new intrinsic overload to keep the
code simple, and make the image dmask handling more generic:
- handle non-sampler image loads
- handle the case where the set of demanded elements is not a prefix

There is some overlap between this code and an optimization that happens in
the backend during code generation. They currently complement each other:
- only the codegen optimization can generate vec3 loads
- only the InstCombine optimization can handle D16

The InstCombine optimization also likely covers more cases, since the codegen
optimization is fairly ad-hoc. Ideally, we'll remove the optimization in
codegen once the infrastructure for vec3 is in place (which will probably
take a long time).

Modify the test cases to use dimension-aware intrinsics. This makes it easier
to see that the test coverage for the new intrinsics is equivalent, and the
old-style intrinsics will be removed in a follow-up commit anyway.

Change-Id: I4b91ea661413d13004956fe4ef7d13d41b8ce3ad

Reviewers: arsenm, rampitec, majnemer

Subscribers: kzhuravl, wdng, mgorny, yaxunl, dstuttard, tpr, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D48165

llvm-svn: 335230
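For context on the dmask handling described above: the image intrinsics write
up to four channels, the dmask immediate selects which ones, and lane i of the
returned vector corresponds to the i-th set bit of dmask. The patch maps
demanded result lanes back onto dmask bits, which is what lets it clear
channels even when the demanded set is not a prefix. A minimal standalone
sketch of that remapping, with an illustrative function name and plain
unsigned values in place of the patch's APInt logic:

unsigned narrowDMask(unsigned DMaskVal, unsigned DemandedLanes) {
  // Lane of the result vector currently being matched to a dmask bit.
  unsigned Lane = 0;
  unsigned NewDMaskVal = 0;
  for (unsigned Bit = 0; Bit < 4; ++Bit) {
    if (DMaskVal & (1u << Bit)) {
      // Channel Bit lands in lane Lane; keep the channel only if demanded.
      if (DemandedLanes & (1u << Lane))
        NewDMaskVal |= 1u << Bit;
      ++Lane;
    }
  }
  return NewDMaskVal;
}

For example, narrowDMask(0xb, 0x5) returns 0x9: dmask 0b1011 writes channels
0, 1, and 3 to lanes 0, 1, and 2, and demanded lanes {0, 2} map back to
channels 0 and 3.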
Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Transforms/InstCombine/CMakeLists.txt                  |   4
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineInternal.h           |   4
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp | 193
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineTables.td            |  11
4 files changed, 141 insertions, 71 deletions
diff --git a/llvm/lib/Transforms/InstCombine/CMakeLists.txt b/llvm/lib/Transforms/InstCombine/CMakeLists.txt
index 5cbe804ce3e..8a3a58e9ecc 100644
--- a/llvm/lib/Transforms/InstCombine/CMakeLists.txt
+++ b/llvm/lib/Transforms/InstCombine/CMakeLists.txt
@@ -1,3 +1,7 @@
+set(LLVM_TARGET_DEFINITIONS InstCombineTables.td)
+tablegen(LLVM InstCombineTables.inc -gen-searchable-tables)
+add_public_tablegen_target(InstCombineTableGen)
+
add_llvm_library(LLVMInstCombine
InstructionCombining.cpp
InstCombineAddSub.cpp
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index ba99035f764..23888526aae 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -706,6 +706,10 @@ private:
/// demanded bits.
bool SimplifyDemandedInstructionBits(Instruction &Inst);
+ Value *simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
+ APInt DemandedElts,
+ int DmaskIdx = -1);
+
Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
APInt &UndefElts, unsigned Depth = 0);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 8190c342c29..133561b30b4 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -23,6 +23,17 @@ using namespace llvm::PatternMatch;
#define DEBUG_TYPE "instcombine"
+namespace {
+
+struct AMDGPUImageDMaskIntrinsic {
+ unsigned Intr;
+};
+
+#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
+#include "InstCombineTables.inc"
+
+} // end anonymous namespace
+
/// Check to see if the specified operand of the specified instruction is a
/// constant integer. If so, check to see if there are any bits set in the
/// constant that are not demanded. If so, shrink the constant and return true.
@@ -909,6 +920,110 @@ InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1,
return nullptr;
}
+/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
+Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
+ APInt DemandedElts,
+ int DMaskIdx) {
+ unsigned VWidth = II->getType()->getVectorNumElements();
+ if (VWidth == 1)
+ return nullptr;
+
+ ConstantInt *NewDMask = nullptr;
+
+ if (DMaskIdx < 0) {
+ // Pretend that a prefix of elements is demanded to simplify the code
+ // below.
+ DemandedElts = (1 << DemandedElts.getActiveBits()) - 1;
+ } else {
+ ConstantInt *DMask = dyn_cast<ConstantInt>(II->getArgOperand(DMaskIdx));
+ if (!DMask)
+ return nullptr; // non-constant dmask is not supported by codegen
+
+ unsigned DMaskVal = DMask->getZExtValue() & 0xf;
+
+ // Mask off values that are undefined because the dmask doesn't cover them
+ DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;
+
+ unsigned NewDMaskVal = 0;
+ unsigned OrigLoadIdx = 0;
+ for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
+ const unsigned Bit = 1 << SrcIdx;
+ if (!!(DMaskVal & Bit)) {
+ if (!!(DemandedElts & (1 << OrigLoadIdx)))
+ NewDMaskVal |= Bit;
+ OrigLoadIdx++;
+ }
+ }
+
+ if (DMaskVal != NewDMaskVal)
+ NewDMask = ConstantInt::get(DMask->getType(), NewDMaskVal);
+ }
+
+ // TODO: Handle 3 vectors when supported in code gen.
+ unsigned NewNumElts = PowerOf2Ceil(DemandedElts.countPopulation());
+ if (!NewNumElts)
+ return UndefValue::get(II->getType());
+
+ if (NewNumElts >= VWidth && DemandedElts.isMask()) {
+ if (NewDMask)
+ II->setArgOperand(DMaskIdx, NewDMask);
+ return nullptr;
+ }
+
+ // Determine the overload types of the original intrinsic.
+ auto IID = II->getIntrinsicID();
+ SmallVector<Intrinsic::IITDescriptor, 16> Table;
+ getIntrinsicInfoTableEntries(IID, Table);
+ ArrayRef<Intrinsic::IITDescriptor> TableRef = Table;
+
+ FunctionType *FTy = II->getCalledFunction()->getFunctionType();
+ SmallVector<Type *, 6> OverloadTys;
+ Intrinsic::matchIntrinsicType(FTy->getReturnType(), TableRef, OverloadTys);
+ for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
+ Intrinsic::matchIntrinsicType(FTy->getParamType(i), TableRef, OverloadTys);
+
+ // Get the new return type overload of the intrinsic.
+ Module *M = II->getParent()->getParent()->getParent();
+ Type *EltTy = II->getType()->getVectorElementType();
+ Type *NewTy = (NewNumElts == 1) ? EltTy : VectorType::get(EltTy, NewNumElts);
+
+ OverloadTys[0] = NewTy;
+ Function *NewIntrin = Intrinsic::getDeclaration(M, IID, OverloadTys);
+
+ SmallVector<Value *, 16> Args;
+ for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I)
+ Args.push_back(II->getArgOperand(I));
+
+ if (NewDMask)
+ Args[DMaskIdx] = NewDMask;
+
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(II);
+
+ CallInst *NewCall = Builder.CreateCall(NewIntrin, Args);
+ NewCall->takeName(II);
+ NewCall->copyMetadata(*II);
+
+ if (NewNumElts == 1) {
+ return Builder.CreateInsertElement(UndefValue::get(II->getType()), NewCall,
+ DemandedElts.countTrailingZeros());
+ }
+
+ SmallVector<uint32_t, 8> EltMask;
+ unsigned NewLoadIdx = 0;
+ for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
+ if (!!(DemandedElts & (1 << OrigLoadIdx)))
+ EltMask.push_back(NewLoadIdx++);
+ else
+ EltMask.push_back(NewNumElts);
+ }
+
+ Value *Shuffle =
+ Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask);
+
+ return Shuffle;
+}
+
/// The specified value produces a vector with any number of elements.
/// DemandedElts contains the set of elements that are actually used by the
/// caller. This method analyzes which elements of the operand are undef and
@@ -1267,8 +1382,6 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
if (!II) break;
switch (II->getIntrinsicID()) {
- default: break;
-
case Intrinsic::x86_xop_vfrcz_ss:
case Intrinsic::x86_xop_vfrcz_sd:
// The instructions for these intrinsics are speced to zero upper bits not
@@ -1582,79 +1695,17 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
case Intrinsic::amdgcn_image_getlod: {
- if (VWidth == 1 || !DemandedElts.isMask())
- return nullptr;
-
- // TODO: Handle 3 vectors when supported in code gen.
- unsigned NewNumElts = PowerOf2Ceil(DemandedElts.countTrailingOnes());
- if (NewNumElts == VWidth)
- return nullptr;
-
- Module *M = II->getParent()->getParent()->getParent();
- Type *EltTy = V->getType()->getVectorElementType();
-
- Type *NewTy = (NewNumElts == 1) ? EltTy :
- VectorType::get(EltTy, NewNumElts);
-
auto IID = II->getIntrinsicID();
-
bool IsBuffer = IID == Intrinsic::amdgcn_buffer_load ||
IID == Intrinsic::amdgcn_buffer_load_format;
+ return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts,
+ IsBuffer ? -1 : 3);
+ }
+ default: {
+ if (getAMDGPUImageDMaskIntrinsic(II->getIntrinsicID()))
+ return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts, 0);
- Function *NewIntrin = IsBuffer ?
- Intrinsic::getDeclaration(M, IID, NewTy) :
- // Samplers have 3 mangled types.
- Intrinsic::getDeclaration(M, IID,
- { NewTy, II->getArgOperand(0)->getType(),
- II->getArgOperand(1)->getType()});
-
- SmallVector<Value *, 5> Args;
- for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I)
- Args.push_back(II->getArgOperand(I));
-
- IRBuilderBase::InsertPointGuard Guard(Builder);
- Builder.SetInsertPoint(II);
-
- CallInst *NewCall = Builder.CreateCall(NewIntrin, Args);
- NewCall->takeName(II);
- NewCall->copyMetadata(*II);
-
- if (!IsBuffer) {
- ConstantInt *DMask = dyn_cast<ConstantInt>(NewCall->getArgOperand(3));
- if (DMask) {
- unsigned DMaskVal = DMask->getZExtValue() & 0xf;
-
- unsigned PopCnt = 0;
- unsigned NewDMask = 0;
- for (unsigned I = 0; I < 4; ++I) {
- const unsigned Bit = 1 << I;
- if (!!(DMaskVal & Bit)) {
- if (++PopCnt > NewNumElts)
- break;
-
- NewDMask |= Bit;
- }
- }
-
- NewCall->setArgOperand(3, ConstantInt::get(DMask->getType(), NewDMask));
- }
- }
-
-
- if (NewNumElts == 1) {
- return Builder.CreateInsertElement(UndefValue::get(V->getType()),
- NewCall, static_cast<uint64_t>(0));
- }
-
- SmallVector<uint32_t, 8> EltMask;
- for (unsigned I = 0; I < VWidth; ++I)
- EltMask.push_back(I);
-
- Value *Shuffle = Builder.CreateShuffleVector(
- NewCall, UndefValue::get(NewTy), EltMask);
-
- MadeChange = true;
- return Shuffle;
+ break;
}
}
break;
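The EltMask loop near the end of the new helper above pads the narrowed call
result back to the original vector width: each demanded lane reads the next
element of the narrow result, and each undemanded lane reads index NewNumElts,
i.e. the first element of the undef second operand of the shufflevector. A
standalone sketch of the mask construction, with illustrative names and a
plain bitmask instead of APInt:

#include <cstdint>
#include <vector>

std::vector<uint32_t> buildEltMask(unsigned VWidth, unsigned DemandedLanes,
                                   unsigned NewNumElts) {
  std::vector<uint32_t> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (DemandedLanes & (1u << OrigLoadIdx))
      EltMask.push_back(NewLoadIdx++); // next lane of the narrowed call
    else
      EltMask.push_back(NewNumElts);   // points into the undef operand
  }
  return EltMask;
}

For VWidth = 4, DemandedLanes = 0b0101, and NewNumElts = 2 this yields
{0, 2, 1, 2}: lanes 0 and 2 come from the two-element call result, while
lanes 1 and 3 come from undef.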
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineTables.td b/llvm/lib/Transforms/InstCombine/InstCombineTables.td
new file mode 100644
index 00000000000..98b2adc442f
--- /dev/null
+++ b/llvm/lib/Transforms/InstCombine/InstCombineTables.td
@@ -0,0 +1,11 @@
+include "llvm/TableGen/SearchableTable.td"
+include "llvm/IR/Intrinsics.td"
+
+def AMDGPUImageDMaskIntrinsicTable : GenericTable {
+ let FilterClass = "AMDGPUImageDMaskIntrinsic";
+ let Fields = ["Intr"];
+
+ let PrimaryKey = ["Intr"];
+ let PrimaryKeyName = "getAMDGPUImageDMaskIntrinsic";
+ let PrimaryKeyEarlyOut = 1;
+}
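For reference, the -gen-searchable-tables backend turns the definition above
into a sorted constant array plus a lookup function in InstCombineTables.inc.
The following is only a rough, hand-written approximation of the generated
code's shape (placeholder intrinsic IDs, simplified types — not the literal
TableGen output), showing how PrimaryKeyEarlyOut lets unrelated intrinsic IDs
return before the binary search:

#include <algorithm>
#include <iterator>

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

// One row per dimension-aware image intrinsic, sorted by intrinsic ID;
// the IDs here are placeholders.
static const AMDGPUImageDMaskIntrinsic AMDGPUImageDMaskIntrinsicTable[] = {
    {101}, {102}, {105},
};

const AMDGPUImageDMaskIntrinsic *getAMDGPUImageDMaskIntrinsic(unsigned Intr) {
  // PrimaryKeyEarlyOut = 1: keys outside the table's key range bail out
  // here, so lookups for non-AMDGPU intrinsics never search the table.
  if (Intr < AMDGPUImageDMaskIntrinsicTable[0].Intr ||
      Intr > std::rbegin(AMDGPUImageDMaskIntrinsicTable)->Intr)
    return nullptr;
  const auto *End = std::end(AMDGPUImageDMaskIntrinsicTable);
  const auto *I = std::lower_bound(
      std::begin(AMDGPUImageDMaskIntrinsicTable), End, Intr,
      [](const AMDGPUImageDMaskIntrinsic &Row, unsigned Key) {
        return Row.Intr < Key;
      });
  if (I == End || I->Intr != Intr)
    return nullptr;
  return I;
}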