diff options
-rw-r--r-- | llvm/include/llvm/Analysis/LoopAccessAnalysis.h | 3 | ||||
-rw-r--r-- | llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp | 18 | ||||
-rw-r--r-- | llvm/test/Transforms/LoopLoadElim/non-consecutive.ll | 43 |
3 files changed, 58 insertions, 6 deletions
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index 13aa0f53ca5..bcf4e5d52f7 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -663,7 +663,8 @@ const SCEV *replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, /// The \p Assume parameter indicates if we are allowed to make additional /// run-time assumptions. int isStridedPtr(PredicatedScalarEvolution &PSE, Value *Ptr, const Loop *Lp, - const ValueToValueMap &StridesMap, bool Assume = false); + const ValueToValueMap &StridesMap = ValueToValueMap(), + bool Assume = false); /// \brief Returns true if the memory operations \p A and \p B are consecutive. /// This is a simple API that does not depend on the analysis pass. diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index c6cec12133a..e4419113d45 100644 --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -61,7 +61,8 @@ struct StoreToLoadForwardingCandidate { /// \brief Return true if the dependence from the store to the load has a /// distance of one. E.g. A[i+1] = A[i] - bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE) const { + bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, + Loop *L) const { Value *LoadPtr = Load->getPointerOperand(); Value *StorePtr = Store->getPointerOperand(); Type *LoadPtrType = LoadPtr->getType(); @@ -72,6 +73,13 @@ struct StoreToLoadForwardingCandidate { LoadType == StorePtr->getType()->getPointerElementType() && "Should be a known dependence"); + // Currently we only support accesses with unit stride. FIXME: we should be + // able to handle non-unit strides as well as long as the stride is equal to + // the dependence distance. 
+ if (isStridedPtr(PSE, LoadPtr, L) != 1 || + isStridedPtr(PSE, StorePtr, L) != 1) + return false; + auto &DL = Load->getParent()->getModule()->getDataLayout(); unsigned TypeByteSize = DL.getTypeAllocSize(const_cast<Type *>(LoadType)); @@ -83,7 +91,7 @@ struct StoreToLoadForwardingCandidate { auto *Dist = cast<SCEVConstant>( PSE.getSE()->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV)); const APInt &Val = Dist->getAPInt(); - return Val.abs() == TypeByteSize; + return Val == TypeByteSize; } Value *getLoadPtr() const { return Load->getPointerOperand(); } @@ -223,8 +231,8 @@ public: // so deciding which one forwards is easy. The later one forwards as // long as they both have a dependence distance of one to the load. if (Cand.Store->getParent() == OtherCand->Store->getParent() && - Cand.isDependenceDistanceOfOne(PSE) && - OtherCand->isDependenceDistanceOfOne(PSE)) { + Cand.isDependenceDistanceOfOne(PSE, L) && + OtherCand->isDependenceDistanceOfOne(PSE, L)) { // They are in the same block, the later one will forward to the load. if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store)) OtherCand = &Cand; @@ -441,7 +449,7 @@ public: // Check whether the SCEV difference is the same as the induction step, // thus we load the value in the next iteration. - if (!Cand.isDependenceDistanceOfOne(PSE)) + if (!Cand.isDependenceDistanceOfOne(PSE, L)) continue; ++NumForwarding; diff --git a/llvm/test/Transforms/LoopLoadElim/non-consecutive.ll b/llvm/test/Transforms/LoopLoadElim/non-consecutive.ll new file mode 100644 index 00000000000..43751a8ff60 --- /dev/null +++ b/llvm/test/Transforms/LoopLoadElim/non-consecutive.ll @@ -0,0 +1,43 @@ +; RUN: opt -loop-load-elim -S < %s | FileCheck %s + +; The accesses to A are independent here but LAA reports it as a loop-carried +; forward dependence. Check that we don't perform st->ld forwarding between +; them. 
+; +; for (unsigned i = 0; i < 100; i++) { +; A[i][1] = B[i] + 2; +; C[i] = A[i][0] * 2; +; } + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +define void @f([2 x i32]* noalias %A, i32* noalias %B, i32* noalias %C, i64 %N) { + +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + + %A1idx = getelementptr inbounds [2 x i32], [2 x i32]* %A, i64 %indvars.iv, i32 1 + %Bidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %Cidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv + %A0idx = getelementptr inbounds [2 x i32], [2 x i32]* %A, i64 %indvars.iv, i32 0 + + %b = load i32, i32* %Bidx, align 4 + %a_p1 = add i32 %b, 2 + store i32 %a_p1, i32* %A1idx, align 4 + +; CHECK: %a = load i32, i32* %A0idx, align 4 + %a = load i32, i32* %A0idx, align 4 +; CHECK: %c = mul i32 %a, 2 + %c = mul i32 %a, 2 + store i32 %c, i32* %Cidx, align 4 + + %exitcond = icmp eq i64 %indvars.iv.next, %N + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} |