[LV] fold-tail predication should be respected even with assume_safety

assume_safety implies that loads under "if's" can be safely executed speculatively (unguarded, unmasked). However this assumption holds only for the original user "if's", not those introduced by the compiler, such as the fold-tail "if" that guards us from loading beyond the original loop trip-count. Currently the combination of fold-tail and assume-safety pragmas results in ignoring the fold-tail predicate that guards the loads, generating unmasked loads. This patch fixes this behavior. Differential Revision: https://reviews.llvm.org/D66106 Reviewers: Ayal, hsaito, fhahn llvm-svn: 368973
author: Dorit Nuzman <dorit.nuzman@intel.com> 2019-08-15 07:12:14 +0000
committer: Dorit Nuzman <dorit.nuzman@intel.com> 2019-08-15 07:12:14 +0000
commit: d57d73daed3057ff48a1e6476a681b2ad46c268c (patch)
tree: 52d58fc222ecc3ccabab84a7ff96f2e42b70dd26 /llvm/test/Transforms/LoopVectorize/X86
parent: 1e246b20c038d61153c7a77b8578a188d40938e2 (diff)
download: bcm5719-llvm-d57d73daed3057ff48a1e6476a681b2ad46c268c.tar.gz
bcm5719-llvm-d57d73daed3057ff48a1e6476a681b2ad46c268c.zip
2 files changed, 177 insertions, 11 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll
new file mode 100644
index 00000000000..98ca49601ba
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll
@@ -0,0 +1,166 @@
+; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -force-vector-width=8 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+; Case1: With pragma predicate to force tail-folding.
+; All memory opertions are masked.
+;void fold_tail(int * restrict p, int * restrict q1, int * restrict q2, int guard) {
+;   #pragma clang loop vectorize_predicate(enable)
+;   for(int ix=0; ix < 1021; ++ix) {
+;     if (ix > guard) {
+;       p[ix] = q1[ix] + q2[ix];
+;     }
+;   }
+;}
+
+;CHECK-LABEL: @fold_tail
+;CHECK: vector.body:
+;CHECK: call <8 x i32> @llvm.masked.load
+;CHECK: call <8 x i32> @llvm.masked.load
+;CHECK: call void @llvm.masked.store
+
+; Function Attrs: nofree norecurse nounwind uwtable
+define dso_local void @fold_tail(i32* noalias nocapture %p, i32* noalias nocapture readonly %q1, i32* noalias nocapture readonly %q2, 
+i32 %guard) local_unnamed_addr #0 {
+entry:
+  %0 = sext i32 %guard to i64
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+  %cmp1 = icmp sgt i64 %indvars.iv, %0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %arrayidx = getelementptr inbounds i32, i32* %q1, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx, align 4, !tbaa !2
+  %arrayidx3 = getelementptr inbounds i32, i32* %q2, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4, !tbaa !2
+  %add = add nsw i32 %2, %1
+  %arrayidx5 = getelementptr inbounds i32, i32* %p, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx5, align 4, !tbaa !2
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1021
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !8
+}
+
+; Case2: With pragma assume_safety only the store is masked.
+; void assume_safety(int * p, int * q1, int * q2, int guard) {
+;   #pragma clang loop vectorize(assume_safety)
+;   for(int ix=0; ix < 1021; ++ix) {
+;     if (ix > guard) {
+;       p[ix] = q1[ix] + q2[ix];
+;     }
+;   }
+;}
+
+;CHECK-LABEL: @assume_safety
+;CHECK: vector.body:
+;CHECK-NOT: @llvm.masked.load
+;CHECK:  call void @llvm.masked.store
+
+; Function Attrs: norecurse nounwind uwtable
+define void @assume_safety(i32* nocapture, i32* nocapture readonly, i32* nocapture readonly, i32) local_unnamed_addr #0 {
+  %5 = sext i32 %3 to i64
+  br label %7
+
+; <label>:6:
+  ret void
+
+; <label>:7:
+  %8 = phi i64 [ 0, %4 ], [ %18, %17 ]
+  %9 = icmp sgt i64 %8, %5
+  br i1 %9, label %10, label %17
+
+; <label>:10:
+  %11 = getelementptr inbounds i32, i32* %1, i64 %8
+  %12 = load i32, i32* %11, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
+  %13 = getelementptr inbounds i32, i32* %2, i64 %8
+  %14 = load i32, i32* %13, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
+  %15 = add nsw i32 %14, %12
+  %16 = getelementptr inbounds i32, i32* %0, i64 %8
+  store i32 %15, i32* %16, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
+  br label %17
+
+; <label>:17:
+  %18 = add nuw nsw i64 %8, 1
+  %19 = icmp eq i64 %18, 1021
+  br i1 %19, label %6, label %7, !llvm.loop !6
+}
+
+; Case3: With pragma assume_safety and pragma predicate both the store and the
+; load are masked.
+; void fold_tail_and_assume_safety(int * p, int * q1, int * q2, int guard) {
+;   #pragma clang loop vectorize(assume_safety) vectorize_predicate(enable)
+;   for(int ix=0; ix < 1021; ++ix) {
+;     if (ix > guard) {
+;       p[ix] = q1[ix] + q2[ix];
+;     }
+;   }
+;}
+
+;CHECK-LABEL: @fold_tail_and_assume_safety
+;CHECK: vector.body:
+;CHECK: call <8 x i32> @llvm.masked.load
+;CHECK: call <8 x i32> @llvm.masked.load
+;CHECK: call void @llvm.masked.store
+
+; Function Attrs: nofree norecurse nounwind uwtable
+define dso_local void @fold_tail_and_assume_safety(i32* noalias nocapture %p, i32* noalias nocapture readonly %q1, i32* noalias nocapture readonly %q2, 
+i32 %guard) local_unnamed_addr #0 {
+entry:
+  %0 = sext i32 %guard to i64
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+  %cmp1 = icmp sgt i64 %indvars.iv, %0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %arrayidx = getelementptr inbounds i32, i32* %q1, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx, align 4, !tbaa !2, !llvm.access.group !10
+  %arrayidx3 = getelementptr inbounds i32, i32* %q2, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4, !tbaa !2, !llvm.access.group !10
+  %add = add nsw i32 %2, %1
+  %arrayidx5 = getelementptr inbounds i32, i32* %p, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx5, align 4, !tbaa !2, !llvm.access.group !10
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1021
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !11
+}
+
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0-1ubuntu2 (tags/RELEASE_600/final)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = distinct !{!6, !7}
+!7 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+!8 = distinct !{!8, !9}
+!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
+
+!10 = distinct !{}
+!11 = distinct !{!11, !12, !13}
+!12 = !{!"llvm.loop.parallel_accesses", !10}
+!13 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
index 7c249a1b422..1e8f1409dfb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
@@ -102,17 +102,17 @@ define void @vectorized1(float* noalias nocapture %A, float* noalias nocapture r
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP7]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP8]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x float> undef), !llvm.access.group !6
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP2]], <8 x float> undef), !llvm.access.group !6
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP6]] to <8 x float>*
+; CHECK-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP8]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP2]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !7
author	Dorit Nuzman <dorit.nuzman@intel.com>	2019-08-15 07:12:14 +0000
committer	Dorit Nuzman <dorit.nuzman@intel.com>	2019-08-15 07:12:14 +0000
commit	d57d73daed3057ff48a1e6476a681b2ad46c268c (patch)
tree	52d58fc222ecc3ccabab84a7ff96f2e42b70dd26 /llvm/test/Transforms/LoopVectorize/X86
parent	1e246b20c038d61153c7a77b8578a188d40938e2 (diff)
download	bcm5719-llvm-d57d73daed3057ff48a1e6476a681b2ad46c268c.tar.gz bcm5719-llvm-d57d73daed3057ff48a1e6476a681b2ad46c268c.zip