[LSV] Refactoring + supporting bitcasts to a type of different size

This is mostly a preparation work for adding a limited support for select instructions. It proved to be difficult to do due to size and irregularity of Vectorizer::isConsecutiveAccess, this is fixed here I believe. It also turned out that these changes make it simpler to finish one of the TODOs and fix a number of other small issues, namely: 1. Looking through bitcasts to a type of a different size (requires careful tracking of the original load/store size and some math converting sizes in bytes to expected differences in indices of GEPs). 2. Reusing partial analysis of pointers done by first attempt in proving them consecutive instead of starting from scratch. This added limited support for nested GEPs co-existing with difficult sext/zext instructions. This also required a careful handling of negative differences between constant parts of offsets. 3. Handing a case where the first pointer index is not an add, but something else (a function parameter for instance). I observe an increased number of successful vectorizations on a large set of shader programs. Only few shaders are affected, but those that are affected sport >5% less loads and stores than before the patch. Reviewed By: rampitec Differential-Revision: https://reviews.llvm.org/D49342 llvm-svn: 337489
author: Roman Tereshin <rtereshin@apple.com> 2018-07-19 19:42:43 +0000
committer: Roman Tereshin <rtereshin@apple.com> 2018-07-19 19:42:43 +0000
commit: b49b2a601fc3c581aed1037389463a2ce63ece4d (patch)
tree: 53c863c0e6b773068cf288d71d216e561628d3f6 /llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
parent: 0c122d5a41cd038b8e29a0d8d671257938da1633 (diff)
download: bcm5719-llvm-b49b2a601fc3c581aed1037389463a2ce63ece4d.tar.gz
bcm5719-llvm-b49b2a601fc3c581aed1037389463a2ce63ece4d.zip
1 files changed, 62 insertions, 46 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 8ce408294c0..14b73e9b61a 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -118,8 +118,6 @@ public:
   bool run();
 
 private:
-  GetElementPtrInst *getSourceGEP(Value *Src) const;
-
   unsigned getPointerAddressSpace(Value *I);
 
   unsigned getAlignment(LoadInst *LI) const {
@@ -139,6 +137,8 @@ private:
   }
 
   bool isConsecutiveAccess(Value *A, Value *B);
+  bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt Size);
+  bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta);
 
   /// After vectorization, reorder the instructions that I depends on
   /// (the instructions defining its operands), to ensure they dominate I.
@@ -277,21 +277,6 @@ unsigned Vectorizer::getPointerAddressSpace(Value *I) {
   return -1;
 }
 
-GetElementPtrInst *Vectorizer::getSourceGEP(Value *Src) const {
-  // First strip pointer bitcasts. Make sure pointee size is the same with
-  // and without casts.
-  // TODO: a stride set by the add instruction below can match the difference
-  // in pointee type size here. Currently it will not be vectorized.
-  Value *SrcPtr = getLoadStorePointerOperand(Src);
-  Value *SrcBase = SrcPtr->stripPointerCasts();
-  Type *SrcPtrType = SrcPtr->getType()->getPointerElementType();
-  Type *SrcBaseType = SrcBase->getType()->getPointerElementType();
-  if (SrcPtrType->isSized() && SrcBaseType->isSized() &&
-      DL.getTypeStoreSize(SrcPtrType) == DL.getTypeStoreSize(SrcBaseType))
-    SrcPtr = SrcBase;
-  return dyn_cast<GetElementPtrInst>(SrcPtr);
-}
-
 // FIXME: Merge with llvm::isConsecutiveAccess
 bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
   Value *PtrA = getLoadStorePointerOperand(A);
@@ -304,7 +289,6 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
     return false;
 
   // Make sure that A and B are different pointers of the same size type.
-  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
   Type *PtrATy = PtrA->getType()->getPointerElementType();
   Type *PtrBTy = PtrB->getType()->getPointerElementType();
   if (PtrA == PtrB ||
@@ -314,10 +298,16 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
           DL.getTypeStoreSize(PtrBTy->getScalarType()))
     return false;
 
+  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
   APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
 
-  unsigned IdxWidth = DL.getIndexSizeInBits(ASA);
-  APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
+  return areConsecutivePointers(PtrA, PtrB, Size);
+}
+
+bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, APInt Size) {
+  unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
+  APInt OffsetA(PtrBitWidth, 0);
+  APInt OffsetB(PtrBitWidth, 0);
   PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
   PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
 
@@ -351,68 +341,94 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
   // Sometimes even this doesn't work, because SCEV can't always see through
   // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
   // things the hard way.
+  return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta);
+}
+
+bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
+                                             APInt PtrDelta) {
+  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
+  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
+  if (!GEPA || !GEPB)
+    return false;
 
   // Look through GEPs after checking they're the same except for the last
   // index.
-  GetElementPtrInst *GEPA = getSourceGEP(A);
-  GetElementPtrInst *GEPB = getSourceGEP(B);
-  if (!GEPA || !GEPB || GEPA->getNumOperands() != GEPB->getNumOperands())
+  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
+      GEPA->getPointerOperand() != GEPB->getPointerOperand())
     return false;
-  unsigned FinalIndex = GEPA->getNumOperands() - 1;
-  for (unsigned i = 0; i < FinalIndex; i++)
-    if (GEPA->getOperand(i) != GEPB->getOperand(i))
+  gep_type_iterator GTIA = gep_type_begin(GEPA);
+  gep_type_iterator GTIB = gep_type_begin(GEPB);
+  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
+    if (GTIA.getOperand() != GTIB.getOperand())
       return false;
+    ++GTIA;
+    ++GTIB;
+  }
 
-  Instruction *OpA = dyn_cast<Instruction>(GEPA->getOperand(FinalIndex));
-  Instruction *OpB = dyn_cast<Instruction>(GEPB->getOperand(FinalIndex));
+  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
+  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
   if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
       OpA->getType() != OpB->getType())
     return false;
 
+  if (PtrDelta.isNegative()) {
+    if (PtrDelta.isMinSignedValue())
+      return false;
+    PtrDelta.negate();
+    std::swap(OpA, OpB);
+  }
+  uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
+  if (PtrDelta.urem(Stride) != 0)
+    return false;
+  unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
+  APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
+
   // Only look through a ZExt/SExt.
   if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
     return false;
 
   bool Signed = isa<SExtInst>(OpA);
 
-  OpA = dyn_cast<Instruction>(OpA->getOperand(0));
+  // At this point A could be a function parameter, i.e. not an instruction
+  Value *ValA = OpA->getOperand(0);
   OpB = dyn_cast<Instruction>(OpB->getOperand(0));
-  if (!OpA || !OpB || OpA->getType() != OpB->getType())
+  if (!OpB || ValA->getType() != OpB->getType())
     return false;
 
-  // Now we need to prove that adding 1 to OpA won't overflow.
+  // Now we need to prove that adding IdxDiff to ValA won't overflow.
   bool Safe = false;
-  // First attempt: if OpB is an add with NSW/NUW, and OpB is 1 added to OpA,
-  // we're okay.
+  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
+  // ValA, we're okay.
   if (OpB->getOpcode() == Instruction::Add &&
       isa<ConstantInt>(OpB->getOperand(1)) &&
-      cast<ConstantInt>(OpB->getOperand(1))->getSExtValue() > 0) {
+      IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) {
     if (Signed)
       Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
     else
       Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
   }
 
-  unsigned BitWidth = OpA->getType()->getScalarSizeInBits();
+  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
 
   // Second attempt:
-  // If any bits are known to be zero other than the sign bit in OpA, we can
-  // add 1 to it while guaranteeing no overflow of any sort.
+  // If all set bits of IdxDiff or any higher order bit other than the sign bit
+  // are known to be zero in ValA, we can add Diff to it while guaranteeing no
+  // overflow of any sort.
   if (!Safe) {
+    OpA = dyn_cast<Instruction>(ValA);
+    if (!OpA)
+      return false;
     KnownBits Known(BitWidth);
     computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
-    if (Known.countMaxTrailingOnes() < (BitWidth - 1))
-      Safe = true;
+    if (Known.Zero.trunc(BitWidth - 1).zext(IdxBitWidth).ult(IdxDiff))
+      return false;
   }
 
-  if (!Safe)
-    return false;
-
-  const SCEV *OffsetSCEVA = SE.getSCEV(OpA);
+  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
   const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
-  const SCEV *One = SE.getConstant(APInt(BitWidth, 1));
-  const SCEV *X2 = SE.getAddExpr(OffsetSCEVA, One);
-  return X2 == OffsetSCEVB;
+  const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
+  const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
+  return X == OffsetSCEVB;
 }
 
 void Vectorizer::reorder(Instruction *I) {
author	Roman Tereshin <rtereshin@apple.com>	2018-07-19 19:42:43 +0000
committer	Roman Tereshin <rtereshin@apple.com>	2018-07-19 19:42:43 +0000
commit	b49b2a601fc3c581aed1037389463a2ce63ece4d (patch)
tree	53c863c0e6b773068cf288d71d216e561628d3f6 /llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
parent	0c122d5a41cd038b8e29a0d8d671257938da1633 (diff)
download	bcm5719-llvm-b49b2a601fc3c581aed1037389463a2ce63ece4d.tar.gz bcm5719-llvm-b49b2a601fc3c581aed1037389463a2ce63ece4d.zip