diff options
author | Roman Tereshin <rtereshin@apple.com> | 2018-07-19 19:42:43 +0000 |
---|---|---|
committer | Roman Tereshin <rtereshin@apple.com> | 2018-07-19 19:42:43 +0000 |
commit | b49b2a601fc3c581aed1037389463a2ce63ece4d (patch) | |
tree | 53c863c0e6b773068cf288d71d216e561628d3f6 /llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | |
parent | 0c122d5a41cd038b8e29a0d8d671257938da1633 (diff) | |
download | bcm5719-llvm-b49b2a601fc3c581aed1037389463a2ce63ece4d.tar.gz bcm5719-llvm-b49b2a601fc3c581aed1037389463a2ce63ece4d.zip |
[LSV] Refactoring + supporting bitcasts to a type of different size
This is mostly a preparation work for adding a limited support for
select instructions. It proved to be difficult to do due to size and
irregularity of Vectorizer::isConsecutiveAccess, this is fixed here I
believe.
It also turned out that these changes make it simpler to finish one of
the TODOs and fix a number of other small issues, namely:
1. Looking through bitcasts to a type of a different size (requires
careful tracking of the original load/store size and some math
converting sizes in bytes to expected differences in indices of GEPs).
2. Reusing partial analysis of pointers done by first attempt in proving
them consecutive instead of starting from scratch. This added limited
support for nested GEPs co-existing with difficult sext/zext
instructions. This also required a careful handling of negative
differences between constant parts of offsets.
3. Handing a case where the first pointer index is not an add, but
something else (a function parameter for instance).
I observe an increased number of successful vectorizations on a large
set of shader programs. Only few shaders are affected, but those that
are affected sport >5% less loads and stores than before the patch.
Reviewed By: rampitec
Differential-Revision: https://reviews.llvm.org/D49342
llvm-svn: 337489
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | 108 |
1 files changed, 62 insertions, 46 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 8ce408294c0..14b73e9b61a 100644 --- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -118,8 +118,6 @@ public: bool run(); private: - GetElementPtrInst *getSourceGEP(Value *Src) const; - unsigned getPointerAddressSpace(Value *I); unsigned getAlignment(LoadInst *LI) const { @@ -139,6 +137,8 @@ private: } bool isConsecutiveAccess(Value *A, Value *B); + bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt Size); + bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta); /// After vectorization, reorder the instructions that I depends on /// (the instructions defining its operands), to ensure they dominate I. @@ -277,21 +277,6 @@ unsigned Vectorizer::getPointerAddressSpace(Value *I) { return -1; } -GetElementPtrInst *Vectorizer::getSourceGEP(Value *Src) const { - // First strip pointer bitcasts. Make sure pointee size is the same with - // and without casts. - // TODO: a stride set by the add instruction below can match the difference - // in pointee type size here. Currently it will not be vectorized. - Value *SrcPtr = getLoadStorePointerOperand(Src); - Value *SrcBase = SrcPtr->stripPointerCasts(); - Type *SrcPtrType = SrcPtr->getType()->getPointerElementType(); - Type *SrcBaseType = SrcBase->getType()->getPointerElementType(); - if (SrcPtrType->isSized() && SrcBaseType->isSized() && - DL.getTypeStoreSize(SrcPtrType) == DL.getTypeStoreSize(SrcBaseType)) - SrcPtr = SrcBase; - return dyn_cast<GetElementPtrInst>(SrcPtr); -} - // FIXME: Merge with llvm::isConsecutiveAccess bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { Value *PtrA = getLoadStorePointerOperand(A); @@ -304,7 +289,6 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { return false; // Make sure that A and B are different pointers of the same size type. - unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); Type *PtrATy = PtrA->getType()->getPointerElementType(); Type *PtrBTy = PtrB->getType()->getPointerElementType(); if (PtrA == PtrB || @@ -314,10 +298,16 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { DL.getTypeStoreSize(PtrBTy->getScalarType())) return false; + unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA); APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy)); - unsigned IdxWidth = DL.getIndexSizeInBits(ASA); - APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0); + return areConsecutivePointers(PtrA, PtrB, Size); +} + +bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, APInt Size) { + unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType()); + APInt OffsetA(PtrBitWidth, 0); + APInt OffsetB(PtrBitWidth, 0); PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); @@ -351,68 +341,94 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { // Sometimes even this doesn't work, because SCEV can't always see through // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking // things the hard way. + return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta); +} + +bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB, + APInt PtrDelta) { + auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA); + auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB); + if (!GEPA || !GEPB) + return false; // Look through GEPs after checking they're the same except for the last // index. - GetElementPtrInst *GEPA = getSourceGEP(A); - GetElementPtrInst *GEPB = getSourceGEP(B); - if (!GEPA || !GEPB || GEPA->getNumOperands() != GEPB->getNumOperands()) + if (GEPA->getNumOperands() != GEPB->getNumOperands() || + GEPA->getPointerOperand() != GEPB->getPointerOperand()) return false; - unsigned FinalIndex = GEPA->getNumOperands() - 1; - for (unsigned i = 0; i < FinalIndex; i++) - if (GEPA->getOperand(i) != GEPB->getOperand(i)) + gep_type_iterator GTIA = gep_type_begin(GEPA); + gep_type_iterator GTIB = gep_type_begin(GEPB); + for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) { + if (GTIA.getOperand() != GTIB.getOperand()) return false; + ++GTIA; + ++GTIB; + } - Instruction *OpA = dyn_cast<Instruction>(GEPA->getOperand(FinalIndex)); - Instruction *OpB = dyn_cast<Instruction>(GEPB->getOperand(FinalIndex)); + Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand()); + Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand()); if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() || OpA->getType() != OpB->getType()) return false; + if (PtrDelta.isNegative()) { + if (PtrDelta.isMinSignedValue()) + return false; + PtrDelta.negate(); + std::swap(OpA, OpB); + } + uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType()); + if (PtrDelta.urem(Stride) != 0) + return false; + unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits(); + APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth); + // Only look through a ZExt/SExt. if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA)) return false; bool Signed = isa<SExtInst>(OpA); - OpA = dyn_cast<Instruction>(OpA->getOperand(0)); + // At this point A could be a function parameter, i.e. not an instruction + Value *ValA = OpA->getOperand(0); OpB = dyn_cast<Instruction>(OpB->getOperand(0)); - if (!OpA || !OpB || OpA->getType() != OpB->getType()) + if (!OpB || ValA->getType() != OpB->getType()) return false; - // Now we need to prove that adding 1 to OpA won't overflow. + // Now we need to prove that adding IdxDiff to ValA won't overflow. bool Safe = false; - // First attempt: if OpB is an add with NSW/NUW, and OpB is 1 added to OpA, - // we're okay. + // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to + // ValA, we're okay. if (OpB->getOpcode() == Instruction::Add && isa<ConstantInt>(OpB->getOperand(1)) && - cast<ConstantInt>(OpB->getOperand(1))->getSExtValue() > 0) { + IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) { if (Signed) Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap(); else Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap(); } - unsigned BitWidth = OpA->getType()->getScalarSizeInBits(); + unsigned BitWidth = ValA->getType()->getScalarSizeInBits(); // Second attempt: - // If any bits are known to be zero other than the sign bit in OpA, we can - // add 1 to it while guaranteeing no overflow of any sort. + // If all set bits of IdxDiff or any higher order bit other than the sign bit + // are known to be zero in ValA, we can add Diff to it while guaranteeing no + // overflow of any sort. if (!Safe) { + OpA = dyn_cast<Instruction>(ValA); + if (!OpA) + return false; KnownBits Known(BitWidth); computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT); - if (Known.countMaxTrailingOnes() < (BitWidth - 1)) - Safe = true; + if (Known.Zero.trunc(BitWidth - 1).zext(IdxBitWidth).ult(IdxDiff)) + return false; } - if (!Safe) - return false; - - const SCEV *OffsetSCEVA = SE.getSCEV(OpA); + const SCEV *OffsetSCEVA = SE.getSCEV(ValA); const SCEV *OffsetSCEVB = SE.getSCEV(OpB); - const SCEV *One = SE.getConstant(APInt(BitWidth, 1)); - const SCEV *X2 = SE.getAddExpr(OffsetSCEVA, One); - return X2 == OffsetSCEVB; + const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth)); + const SCEV *X = SE.getAddExpr(OffsetSCEVA, C); + return X == OffsetSCEVB; } void Vectorizer::reorder(Instruction *I) { |