diff options
| author | Ayal Zaks <ayal.zaks@intel.com> | 2019-08-28 09:02:23 +0000 |
|---|---|---|
| committer | Ayal Zaks <ayal.zaks@intel.com> | 2019-08-28 09:02:23 +0000 |
| commit | d15df0ede5898f83a9157fa5985386bd0b17e2c0 (patch) | |
| tree | bbb25cbe6f4633a053aeb66ca3cc949abbe0dd37 /llvm/test/Transforms/LoopVectorize/X86 | |
| parent | 8fbe81fb29e5c7f7d5e68e82063d43410121e6c4 (diff) | |
| download | bcm5719-llvm-d15df0ede5898f83a9157fa5985386bd0b17e2c0.tar.gz bcm5719-llvm-d15df0ede5898f83a9157fa5985386bd0b17e2c0.zip | |
[LV] Fold tail by masking - handle reductions
Allow vectorizing loops that have reductions when the tail is folded by masking.
A select is introduced in VPlan that chooses, according to the "i < n" mask of
fold-tail, between the last value carried by the loop-exit/live-out instruction
of the reduction and the penultimate value carried by the reduction phi.
This select replaces the last value as the live-out value of the loop.
Differential Revision: https://reviews.llvm.org/D66720
llvm-svn: 370173
Diffstat (limited to 'llvm/test/Transforms/LoopVectorize/X86')
| -rw-r--r-- | llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll | 56 |
1 files changed, 56 insertions, 0 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll index eb0b499f512..d9db8a243a3 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll @@ -78,6 +78,62 @@ for.body: br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10 } +; Check that fold tail under optsize passes the reduction live-out value +; through a select. +; int reduction_i32(int *A, int *B, int N) { +; int sum = 0; +; for (int i = 0; i < N; ++i) +; sum += (A[i] + B[i]); +; return sum; +; } +; +define i32 @reduction_i32(i32* nocapture readonly %A, i32* nocapture readonly %B, i32 %N) #0 { +; CHECK-LABEL: @reduction_i32( +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[ACCUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, %vector.ph ], [ [[ACCUM:%.*]], %vector.body ] +; CHECK: [[ICMPULE:%.*]] = icmp ule <8 x i64> +; CHECK: [[LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* {{.*}}, i32 4, <8 x i1> [[ICMPULE]], <8 x i32> undef) +; CHECK: [[LOAD2:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* {{.*}}, i32 4, <8 x i1> [[ICMPULE]], <8 x i32> undef) +; CHECK-NEXT: [[ADD:%.*]] = add nsw <8 x i32> [[LOAD2]], [[LOAD1]] +; CHECK-NEXT: [[ACCUM]] = add nuw nsw <8 x i32> [[ADD]], [[ACCUM_PHI]] +; CHECK: [[LIVEOUT:%.*]] = select <8 x i1> [[ICMPULE]], <8 x i32> [[ACCUM]], <8 x i32> [[ACCUM_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; CHECK: middle.block: +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[LIVEOUT]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[LIVEOUT]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, 
i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX5:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF4]] +; CHECK-NEXT: [[RDX_SHUF6:%.*]] = shufflevector <8 x i32> [[BIN_RDX5]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <8 x i32> [[BIN_RDX5]], [[RDX_SHUF6]] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[BIN_RDX7]], i32 0 +; CHECK-NEXT: br i1 true, label %for.cond.cleanup, label %scalar.ph +; CHECK: scalar.ph: +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi i32 [ {{.*}}, %for.body ], [ [[TMP17]], %middle.block ] +; CHECK-NEXT: ret i32 [[SUM_1_LCSSA]] +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %sum.0 = phi i32 [ %sum.1, %for.body ], [ 0, %entry ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidxA = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidxA, align 4 + %arrayidxB = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %1 = load i32, i32* %arrayidxB, align 4 + %add = add nsw i32 %1, %0 + %sum.1 = add nuw nsw i32 %add, %sum.0 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret i32 %sum.1 +} + ; CHECK: !0 = distinct !{!0, !1} ; CHECK-NEXT: !1 = !{!"llvm.loop.isvectorized", i32 1} ; CHECK-NEXT: !2 = distinct !{!2, !3, !1} |

