diff options
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 21 | ||||
-rw-r--r-- | llvm/test/Transforms/LoopVectorize/consecutive-ptr-cg-bug.ll | 68 |
2 files changed, 84 insertions, 5 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 4ea41157a09..fbcdc0df0f1 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1612,6 +1612,8 @@ public: /// 0 - Stride is unknown or non-consecutive. /// 1 - Address is consecutive. /// -1 - Address is consecutive, and decreasing. + /// NOTE: This method must only be used before modifying the original scalar + /// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965). int isConsecutivePtr(Value *Ptr); /// Returns true if the value V is uniform within the loop. @@ -1966,7 +1968,8 @@ public: /// Decision that was taken during cost calculation for memory instruction. enum InstWidening { CM_Unknown, - CM_Widen, + CM_Widen, // For consecutive accesses with stride +1. + CM_Widen_Reverse, // For consecutive accesses with stride -1. CM_Interleave, CM_GatherScatter, CM_Scalarize @@ -3204,8 +3207,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. - int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); - bool Reverse = ConsecutiveStride < 0; + bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse); + bool ConsecutiveStride = + Reverse || (Decision == LoopVectorizationCostModel::CM_Widen); bool CreateGatherScatter = (Decision == LoopVectorizationCostModel::CM_GatherScatter); @@ -5711,6 +5715,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { "Widening decision should be ready at this moment"); return (WideningDecision == CM_Widen || + WideningDecision == CM_Widen_Reverse || WideningDecision == CM_Interleave); }; // Iterate over the instructions in the loop, and collect all @@ -7180,7 +7185,12 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { // We assume that widening is the best solution when possible. if (Legal->memoryInstructionCanBeWidened(&I, VF)) { unsigned Cost = getConsecutiveMemOpCost(&I, VF); - setWideningDecision(&I, VF, CM_Widen, Cost); + int ConsecutiveStride = Legal->isConsecutivePtr(getPointerOperand(&I)); + assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && + "Expected consecutive stride."); + InstWidening Decision = + ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; + setWideningDecision(&I, VF, Decision, Cost); continue; } @@ -7270,7 +7280,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { // by cost functions, but since this involves the task of finding out // if the loaded register is involved in an address computation, it is // instead changed here when we know this is the case. - if (getWideningDecision(I, VF) == CM_Widen) + InstWidening Decision = getWideningDecision(I, VF); + if (Decision == CM_Widen || Decision == CM_Widen_Reverse) // Scalarize a widened load of address. setWideningDecision(I, VF, CM_Scalarize, (VF * getMemoryInstructionCost(I, 1))); diff --git a/llvm/test/Transforms/LoopVectorize/consecutive-ptr-cg-bug.ll b/llvm/test/Transforms/LoopVectorize/consecutive-ptr-cg-bug.ll new file mode 100644 index 00000000000..456271ea1aa --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/consecutive-ptr-cg-bug.ll @@ -0,0 +1,68 @@ +; RUN: opt -loop-vectorize -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1" +target triple = "x86_64-unknown-linux-gnu" + +; PR34965/D39346 + +; LV retains the original scalar loop intact as remainder loop. However, +; after this transformation, analysis information concerning the remainder +; loop may differ from the original scalar loop. This test is an example of +; that behaviour, where values inside the remainder loop which SCEV could +; originally analyze now require flow-sensitive analysis currently not +; supported in SCEV. In particular, during LV code generation, after turning +; the original scalar loop into the remainder loop, LV expected +; Legal->isConsecutivePtr() to be consistent and return the same output as +; during legal/cost model phases (original scalar loop). Unfortunately, that +; condition was not satisfied because of the aforementioned SCEV limitation. +; After D39346, LV code generation doesn't rely on Legal->isConsecutivePtr(), +; i.e., SCEV. This test verifies that LV is able to handle the described cases. +; +; TODO: The SCEV limitation described before may affect plans to further +; optimize the remainder loop of this particular test case. One tentative +; solution is to detect the problematic IVs in LV (%7 and %8) and perform an +; in-place IV optimization by replacing: +; %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ] with +; with +; %8 = sub i32 %7, 1. + + +; Verify that store is vectorized as stride-1 memory access. + +; CHECK: vector.body: +; CHECK: store <4 x i32> + +; Function Attrs: uwtable +define void @test() { + br label %.outer + +; <label>:1: ; preds = %2 + ret void + +; <label>:2: ; preds = %._crit_edge.loopexit + %3 = add nsw i32 %.ph, -2 + br i1 undef, label %1, label %.outer + +.outer: ; preds = %2, %0 + %.ph = phi i32 [ %3, %2 ], [ 336, %0 ] + %.ph2 = phi i32 [ 62, %2 ], [ 110, %0 ] + %4 = and i32 %.ph, 30 + %5 = add i32 %.ph2, 1 + br label %6 + +; <label>:6: ; preds = %6, %.outer + %7 = phi i32 [ %5, %.outer ], [ %13, %6 ] + %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ] + %9 = add i32 %8, 2 + %10 = zext i32 %9 to i64 + %11 = getelementptr inbounds i32, i32 addrspace(1)* undef, i64 %10 + %12 = ashr i32 undef, %4 + store i32 %12, i32 addrspace(1)* %11, align 4 + %13 = add i32 %7, 1 + %14 = icmp sgt i32 %13, 61 + br i1 %14, label %._crit_edge.loopexit, label %6 + +._crit_edge.loopexit: ; preds = %._crit_edge.loopexit, %6 + br i1 undef, label %2, label %._crit_edge.loopexit +} + |