summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAlexey Bataev <a.bataev@hotmail.com>2017-03-01 10:50:44 +0000
committerAlexey Bataev <a.bataev@hotmail.com>2017-03-01 10:50:44 +0000
commitdfec81107f2fbc804786b90523074194d70d8c30 (patch)
tree6f20771b2aa6dd785c52b75965849113abaa0bd5
parenteab6cd474c478942949942d844da0ed51f31e394 (diff)
downloadbcm5719-llvm-dfec81107f2fbc804786b90523074194d70d8c30.tar.gz
bcm5719-llvm-dfec81107f2fbc804786b90523074194d70d8c30.zip
[SLP] Fix for PR32038: extra add of PHI node when it is not required.
Summary: If horizontal reduction tree starts from the binary operation that is used in PHI node, but this PHI is not used in horizontal reduction, we may end up with extra addition of this PHI node after vectorization. Here is an example: ``` %phi = phi i32 [ %tmp, %end], ... ... %tmp = add i32 %tmp1, %tmp2 end: ``` after vectorization we always have something like: ``` %phi = phi i32 [ %tmp, %end], ... ... %red = extractelement <8 x 32> %vec.red, 0 %tmp = add i32 %red, %phi end: ``` even if `%phi` is not used in reduction tree. Patch considers these PHI nodes as extra arguments and considers them in the final result iff they really used in reduction. Reviewers: mkuper, hfinkel, mzolotukhin Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D30409 llvm-svn: 296606
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp20
-rw-r--r--llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll20
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll8
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll2
4 files changed, 18 insertions, 32 deletions
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9c6cc503ef0..c6d26abc76a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4250,14 +4250,6 @@ class HorizontalReduction {
MapVector<Instruction *, Value *> ExtraArgs;
BinaryOperator *ReductionRoot = nullptr;
- // After successfull horizontal reduction vectorization attempt for PHI node
- // vectorizer tries to update root binary op by combining vectorized tree and
- // the ReductionPHI node. But during vectorization this ReductionPHI can be
- // vectorized itself and replaced by the undef value, while the instruction
- // itself is marked for deletion. This 'marked for deletion' PHI node then can
- // be used in new binary operation, causing "Use still stuck around after Def
- // is destroyed" crash upon PHI node deletion.
- WeakVH ReductionPHI;
/// The opcode of the reduction.
Instruction::BinaryOps ReductionOpcode = Instruction::BinaryOpsEnd;
@@ -4318,7 +4310,6 @@ public:
ReductionOpcode = B->getOpcode();
ReducedValueOpcode = 0;
ReductionRoot = B;
- ReductionPHI = Phi;
// We currently only support adds.
if ((ReductionOpcode != Instruction::Add &&
@@ -4406,9 +4397,9 @@ public:
Stack.push_back(std::make_pair(I, 0));
continue;
}
- // NextV is an extra argument for TreeN (its parent operation).
- markExtraArg(Stack.back(), NextV);
}
+ // NextV is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), NextV);
}
return true;
}
@@ -4497,12 +4488,7 @@ public:
}
}
// Update users.
- if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
- assert(ReductionRoot && "Need a reduction operation");
- ReductionRoot->setOperand(0, VectorizedTree);
- ReductionRoot->setOperand(1, ReductionPHI);
- } else
- ReductionRoot->replaceAllUsesWith(VectorizedTree);
+ ReductionRoot->replaceAllUsesWith(VectorizedTree);
}
return VectorizedTree != nullptr;
}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
index 4085e099b90..b7fa5452f25 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
@@ -9,7 +9,7 @@ target triple = "aarch64--linux-gnu"
@a = common global [80 x i8] zeroinitializer, align 16
; DEFAULT-LABEL: @PR28330(
-; DEFAULT: %tmp17 = phi i32 [ %tmp34, %for.body ], [ 0, %entry ]
+; DEFAULT: %tmp17 = phi i32 [ %bin.extra, %for.body ], [ 0, %entry ]
; DEFAULT: %[[S0:.+]] = select <8 x i1> %1, <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
; DEFAULT: %[[R0:.+]] = shufflevector <8 x i32> %[[S0]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; DEFAULT: %[[R1:.+]] = add <8 x i32> %[[S0]], %[[R0]]
@@ -18,10 +18,10 @@ target triple = "aarch64--linux-gnu"
; DEFAULT: %[[R4:.+]] = shufflevector <8 x i32> %[[R3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; DEFAULT: %[[R5:.+]] = add <8 x i32> %[[R3]], %[[R4]]
; DEFAULT: %[[R6:.+]] = extractelement <8 x i32> %[[R5]], i32 0
-; DEFAULT: %tmp34 = add i32 %[[R6]], %tmp17
+; DEFAULT: %bin.extra = add i32 %[[R6]], %tmp17
;
; GATHER-LABEL: @PR28330(
-; GATHER: %tmp17 = phi i32 [ %tmp34, %for.body ], [ 0, %entry ]
+; GATHER: %tmp17 = phi i32 [ %bin.extra, %for.body ], [ 0, %entry ]
; GATHER: %tmp19 = select i1 %tmp1, i32 -720, i32 -80
; GATHER: %tmp21 = select i1 %tmp3, i32 -720, i32 -80
; GATHER: %tmp23 = select i1 %tmp5, i32 -720, i32 -80
@@ -45,7 +45,7 @@ target triple = "aarch64--linux-gnu"
; GATHER: %[[R4:.+]] = shufflevector <8 x i32> %[[R3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GATHER: %[[R5:.+]] = add <8 x i32> %[[R3]], %[[R4]]
; GATHER: %[[R6:.+]] = extractelement <8 x i32> %[[R5]], i32 0
-; GATHER: %tmp34 = add i32 %[[R6]], %tmp17
+; GATHER: %bin.extra = add i32 %[[R6]], %tmp17
;
; MAX-COST-LABEL: @PR28330(
; MAX-COST-NOT: shufflevector
@@ -98,7 +98,7 @@ define void @PR32038(i32 %n) {
; DEFAULT-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
; DEFAULT-NEXT: br label [[FOR_BODY:%.*]]
; DEFAULT: for.body:
-; DEFAULT-NEXT: [[TMP17:%.*]] = phi i32 [ [[TMP34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; DEFAULT-NEXT: [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; DEFAULT-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
; DEFAULT-NEXT: [[TMP20:%.*]] = add i32 -5, undef
; DEFAULT-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], undef
@@ -114,8 +114,8 @@ define void @PR32038(i32 %n) {
; DEFAULT-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; DEFAULT-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
; DEFAULT-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
-; DEFAULT-NEXT: [[BIN_EXTRA:%.*]] = add i32 [[TMP3]], -5
-; DEFAULT-NEXT: [[TMP34]] = add i32 [[BIN_EXTRA]], [[TMP17]]
+; DEFAULT-NEXT: [[BIN_EXTRA]] = add i32 [[TMP3]], -5
+; DEFAULT-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], undef
; DEFAULT-NEXT: br label [[FOR_BODY]]
;
; GATHER-LABEL: @PR32038(
@@ -138,7 +138,7 @@ define void @PR32038(i32 %n) {
; GATHER-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0
; GATHER-NEXT: br label [[FOR_BODY:%.*]]
; GATHER: for.body:
-; GATHER-NEXT: [[TMP17:%.*]] = phi i32 [ [[TMP34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; GATHER-NEXT: [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; GATHER-NEXT: [[TMP19:%.*]] = select i1 [[TMP1]], i32 -720, i32 -80
; GATHER-NEXT: [[TMP20:%.*]] = add i32 -5, [[TMP19]]
; GATHER-NEXT: [[TMP21:%.*]] = select i1 [[TMP3]], i32 -720, i32 -80
@@ -169,8 +169,8 @@ define void @PR32038(i32 %n) {
; GATHER-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; GATHER-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
; GATHER-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
-; GATHER-NEXT: [[BIN_EXTRA:%.*]] = add i32 [[TMP8]], -5
-; GATHER-NEXT: [[TMP34]] = add i32 [[BIN_EXTRA]], [[TMP17]]
+; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[TMP8]], -5
+; GATHER-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], [[TMP33]]
; GATHER-NEXT: br label [[FOR_BODY]]
;
; MAX-COST-LABEL: @PR32038(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
index 1ffbe0d87bf..8370c52e2ea 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
@@ -14,7 +14,7 @@ define i32 @test(i32* nocapture readonly %p) {
; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* %p, i64 7
; CHECK-NEXT: br label %for.body
; CHECK: for.body:
-; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, %entry ], [ %add.7, %for.body ]
+; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, %entry ], [ %bin.extra, %for.body ]
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* %p to <8 x i32>*
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>, [[TMP1]]
@@ -32,10 +32,10 @@ define i32 @test(i32* nocapture readonly %p) {
; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
-; CHECK-NEXT: [[ADD_7:%.*]] = add i32 [[TMP4]], [[SUM]]
-; CHECK-NEXT: br i1 true, label %for.end, label %for.body
+; CHECK-NEXT: [[BIN_EXTRA:%.*]] = add i32 [[TMP4]], [[SUM]]
+; CHECK: br i1 true, label %for.end, label %for.body
; CHECK: for.end:
-; CHECK-NEXT: ret i32 [[ADD_7]]
+; CHECK-NEXT: ret i32 [[BIN_EXTRA]]
;
entry:
%arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll
index 5377ee82cf9..a01646b9dbe 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll
@@ -12,7 +12,7 @@ define i32 @foo(i32* nocapture readonly %diff) #0 {
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
-; CHECK-NEXT: [[ADD52:%.*]] = add nsw i32 [[TMP15]],
+; CHECK: [[ADD52:%.*]] = add i32 [[TMP15]],
; CHECK: ret i32 [[ADD52]]
;
entry:
OpenPOWER on IntegriCloud