diff options
author | Dorit Nuzman <dorit.nuzman@intel.com> | 2019-08-15 07:12:14 +0000 |
---|---|---|
committer | Dorit Nuzman <dorit.nuzman@intel.com> | 2019-08-15 07:12:14 +0000 |
commit | d57d73daed3057ff48a1e6476a681b2ad46c268c (patch) | |
tree | 52d58fc222ecc3ccabab84a7ff96f2e42b70dd26 /llvm/test/Transforms/LoopVectorize/X86 | |
parent | 1e246b20c038d61153c7a77b8578a188d40938e2 (diff) | |
download | bcm5719-llvm-d57d73daed3057ff48a1e6476a681b2ad46c268c.tar.gz bcm5719-llvm-d57d73daed3057ff48a1e6476a681b2ad46c268c.zip |
[LV] fold-tail predication should be respected even with assume_safety
assume_safety implies that loads under "if's" can be safely executed
speculatively (unguarded, unmasked). However, this assumption holds only for the
original user "if's", not for those introduced by the compiler, such as the
fold-tail "if" that guards us from loading beyond the original loop trip-count.
Currently, the combination of the fold-tail and assume_safety pragmas results in
the fold-tail predicate that guards the loads being ignored, so unmasked loads
are generated. This patch fixes this behavior.
Differential Revision: https://reviews.llvm.org/D66106
Reviewers: Ayal, hsaito, fhahn
llvm-svn: 368973
Diffstat (limited to 'llvm/test/Transforms/LoopVectorize/X86')
-rw-r--r-- | llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll | 166 | ||||
-rw-r--r-- | llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll | 22 |
2 files changed, 177 insertions, 11 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll new file mode 100644 index 00000000000..98ca49601ba --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll @@ -0,0 +1,166 @@ +; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -force-vector-width=8 -force-vector-interleave=1 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-linux-gnu" + +; Case1: With pragma predicate to force tail-folding. +; All memory opertions are masked. +;void fold_tail(int * restrict p, int * restrict q1, int * restrict q2, int guard) { +; #pragma clang loop vectorize_predicate(enable) +; for(int ix=0; ix < 1021; ++ix) { +; if (ix > guard) { +; p[ix] = q1[ix] + q2[ix]; +; } +; } +;} + +;CHECK-LABEL: @fold_tail +;CHECK: vector.body: +;CHECK: call <8 x i32> @llvm.masked.load +;CHECK: call <8 x i32> @llvm.masked.load +;CHECK: call void @llvm.masked.store + +; Function Attrs: nofree norecurse nounwind uwtable +define dso_local void @fold_tail(i32* noalias nocapture %p, i32* noalias nocapture readonly %q1, i32* noalias nocapture readonly %q2, +i32 %guard) local_unnamed_addr #0 { +entry: + %0 = sext i32 %guard to i64 + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ] + %cmp1 = icmp sgt i64 %indvars.iv, %0 + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %arrayidx = getelementptr inbounds i32, i32* %q1, i64 %indvars.iv + %1 = load i32, i32* %arrayidx, align 4, !tbaa !2 + %arrayidx3 = getelementptr inbounds i32, i32* %q2, i64 %indvars.iv + %2 = load i32, i32* %arrayidx3, align 4, !tbaa !2 + %add = add nsw i32 %2, %1 + %arrayidx5 = getelementptr inbounds i32, i32* %p, i64 %indvars.iv + store i32 %add, i32* %arrayidx5, align 4, !tbaa !2 + br label %for.inc + +for.inc: + %indvars.iv.next = add nuw 
nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1021 + br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !8 +} + +; Case2: With pragma assume_safety only the store is masked. +; void assume_safety(int * p, int * q1, int * q2, int guard) { +; #pragma clang loop vectorize(assume_safety) +; for(int ix=0; ix < 1021; ++ix) { +; if (ix > guard) { +; p[ix] = q1[ix] + q2[ix]; +; } +; } +;} + +;CHECK-LABEL: @assume_safety +;CHECK: vector.body: +;CHECK-NOT: @llvm.masked.load +;CHECK: call void @llvm.masked.store + +; Function Attrs: norecurse nounwind uwtable +define void @assume_safety(i32* nocapture, i32* nocapture readonly, i32* nocapture readonly, i32) local_unnamed_addr #0 { + %5 = sext i32 %3 to i64 + br label %7 + +; <label>:6: + ret void + +; <label>:7: + %8 = phi i64 [ 0, %4 ], [ %18, %17 ] + %9 = icmp sgt i64 %8, %5 + br i1 %9, label %10, label %17 + +; <label>:10: + %11 = getelementptr inbounds i32, i32* %1, i64 %8 + %12 = load i32, i32* %11, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6 + %13 = getelementptr inbounds i32, i32* %2, i64 %8 + %14 = load i32, i32* %13, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6 + %15 = add nsw i32 %14, %12 + %16 = getelementptr inbounds i32, i32* %0, i64 %8 + store i32 %15, i32* %16, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6 + br label %17 + +; <label>:17: + %18 = add nuw nsw i64 %8, 1 + %19 = icmp eq i64 %18, 1021 + br i1 %19, label %6, label %7, !llvm.loop !6 +} + +; Case3: With pragma assume_safety and pragma predicate both the store and the +; load are masked. 
+; void fold_tail_and_assume_safety(int * p, int * q1, int * q2, int guard) { +; #pragma clang loop vectorize(assume_safety) vectorize_predicate(enable) +; for(int ix=0; ix < 1021; ++ix) { +; if (ix > guard) { +; p[ix] = q1[ix] + q2[ix]; +; } +; } +;} + +;CHECK-LABEL: @fold_tail_and_assume_safety +;CHECK: vector.body: +;CHECK: call <8 x i32> @llvm.masked.load +;CHECK: call <8 x i32> @llvm.masked.load +;CHECK: call void @llvm.masked.store + +; Function Attrs: nofree norecurse nounwind uwtable +define dso_local void @fold_tail_and_assume_safety(i32* noalias nocapture %p, i32* noalias nocapture readonly %q1, i32* noalias nocapture readonly %q2, +i32 %guard) local_unnamed_addr #0 { +entry: + %0 = sext i32 %guard to i64 + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ] + %cmp1 = icmp sgt i64 %indvars.iv, %0 + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %arrayidx = getelementptr inbounds i32, i32* %q1, i64 %indvars.iv + %1 = load i32, i32* %arrayidx, align 4, !tbaa !2, !llvm.access.group !10 + %arrayidx3 = getelementptr inbounds i32, i32* %q2, i64 %indvars.iv + %2 = load i32, i32* %arrayidx3, align 4, !tbaa !2, !llvm.access.group !10 + %add = add nsw i32 %2, %1 + %arrayidx5 = getelementptr inbounds i32, i32* %p, i64 %indvars.iv + store i32 %add, i32* %arrayidx5, align 4, !tbaa !2, !llvm.access.group !10 + br label %for.inc + +for.inc: + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1021 + br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !11 +} + +attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" 
"target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 6.0.0-1ubuntu2 (tags/RELEASE_600/final)"} +!2 = !{!3, !3, i64 0} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} +!6 = distinct !{!6, !7} +!7 = !{!"llvm.loop.vectorize.enable", i1 true} + +!8 = distinct !{!8, !9} +!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} + +!10 = distinct !{} +!11 = distinct !{!11, !12, !13} +!12 = !{!"llvm.loop.parallel_accesses", !10} +!13 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll index 7c249a1b422..1e8f1409dfb 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll @@ -102,17 +102,17 @@ define void @vectorized1(float* noalias nocapture %A, float* noalias nocapture r ; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4 -; CHECK-NEXT: 
[[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP7]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP8]]) +; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <8 x float>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x float> undef), !llvm.access.group !6 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP2]], <8 x float> undef), !llvm.access.group !6 +; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP6]] to <8 x float>* +; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP8]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP2]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !7 |