[LV] Scalar with predication must not be uniform

Fix PR40816: avoid considering scalar-with-predication instructions as also being uniform-after-vectorization. Instructions identified as "scalar with predication" are "vectorized" using a replicating region, i.e., one predicated instance of the instruction per vector lane. If such an instruction is also optimized as "uniform after vectorization", namely when only the first of the VF lanes is used, replicating the region is erroneous: only the first instance of the region can and should be formed. Fix such cases by not considering such instructions as "uniform after vectorization".

Differential Revision: https://reviews.llvm.org/D70298
author: Ayal Zaks <ayal.zaks@intel.com> 2019-11-27 00:08:29 +0200
committer: Ayal Zaks <ayal.zaks@intel.com> 2019-12-03 19:50:24 +0200
commit: 6ed9cef25f915d4533f261c401cee29d8d8012d5 (patch)
tree: b51f4fd4b3161bd19ad9cc333f794e69fff92167 /llvm/test/Transforms/LoopVectorize/X86
parent: 96c8024e2eb05278206b1eb59208bad0f3c68f2e (diff)
download: bcm5719-llvm-6ed9cef25f915d4533f261c401cee29d8d8012d5.tar.gz bcm5719-llvm-6ed9cef25f915d4533f261c401cee29d8d8012d5.zip
1 file changed, 83 insertions, 0 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
index e18159f2462..93285accb06 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
@@ -1,5 +1,6 @@
 ; REQUIRES: asserts
 ; RUN: opt < %s -loop-vectorize -instcombine -S -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=2 -S | FileCheck %s -check-prefix=FORCE
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -65,3 +66,85 @@ for.end:
 }
 
 attributes #0 = { "target-cpu"="knl" }
+
+; CHECK-LABEL: PR40816
+;
+; Check that scalar with predication instructions are not considered uniform
+; after vectorization, because that results in replicating a region instead of
+; having a single instance (out of VF). The predication stems from a tiny count
+; of 3 leading to folding the tail by masking using icmp ule <i, i+1> <= <2, 2>.
+;
+; CHECK:     LV: Found trip count: 3
+; CHECK:     LV: Found uniform instruction:   {{%.*}} = icmp eq i32 {{%.*}}, 0
+; CHECK-NOT: LV: Found uniform instruction:   {{%.*}} = load i32, i32* {{%.*}}, align 1
+; CHECK:     LV: Found not uniform being ScalarWithPredication:  {{%.*}} = load i32, i32* {{%.*}}, align 1
+; CHECK:     LV: Found scalar instruction:   {{%.*}} = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 {{%.*}}
+;
+; FORCE-LABEL: @PR40816(
+; FORCE-NEXT:  entry:
+; FORCE-NEXT:    br i1 false, label {{%.*}}, label [[VECTOR_PH:%.*]]
+; FORCE:       vector.ph:
+; FORCE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FORCE:       vector.body:
+; FORCE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE4:%.*]] ]
+; FORCE-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE4]] ]
+; FORCE-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; FORCE-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 1
+; FORCE-NEXT:    [[TMP2:%.*]] = icmp ule <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; FORCE-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; FORCE-NEXT:    br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FORCE:       pred.store.if:
+; FORCE-NEXT:    store i32 [[TMP0]], i32* @b, align 1
+; FORCE-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; FORCE:       pred.store.continue:
+; FORCE-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
+; FORCE-NEXT:    br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
+; FORCE:       pred.store.if1:
+; FORCE-NEXT:    store i32 [[TMP1]], i32* @b, align 1
+; FORCE-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; FORCE:       pred.store.continue2:
+; FORCE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; FORCE-NEXT:    br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; FORCE:       pred.load.if:
+; FORCE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 [[TMP0]]
+; FORCE-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 1
+; FORCE-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> undef, i32 [[TMP7]], i32 0
+; FORCE-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; FORCE:       pred.load.continue:
+; FORCE-NEXT:    [[TMP9:%.*]] = phi <2 x i32> [ undef, [[PRED_STORE_CONTINUE2]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ]
+; FORCE-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
+; FORCE-NEXT:    br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4]]
+; FORCE:       pred.load.if3:
+; FORCE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 [[TMP1]]
+; FORCE-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 1
+; FORCE-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP12]], i32 1
+; FORCE-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
+; FORCE:       pred.load.continue4:
+; FORCE-NEXT:    [[TMP14:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF3]] ]
+; FORCE-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
+; FORCE-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; FORCE-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
+; FORCE-NEXT:    br i1 [[TMP15]], label {{%.*}}, label [[VECTOR_BODY]]
+;
+@a = internal constant [3 x i32] [i32 7, i32 7, i32 0], align 1
+@b = external global i32, align 1
+
+define void @PR40816() #1 {
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  store i32 %0, i32* @b, align 1
+  %arrayidx1 = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 %0
+  %1 = load i32, i32* %arrayidx1, align 1
+  %cmp2 = icmp eq i32 %1, 0
+  %inc = add nuw nsw i32 %0, 1
+  br i1 %cmp2, label %return, label %for.body
+
+return:                                           ; preds = %for.body
+  ret void
+}
+
+attributes #1 = { "target-cpu"="core2" }
author	Ayal Zaks <ayal.zaks@intel.com>	2019-11-27 00:08:29 +0200
committer	Ayal Zaks <ayal.zaks@intel.com>	2019-12-03 19:50:24 +0200
commit	6ed9cef25f915d4533f261c401cee29d8d8012d5 (patch)
tree	b51f4fd4b3161bd19ad9cc333f794e69fff92167 /llvm/test/Transforms/LoopVectorize/X86
parent	96c8024e2eb05278206b1eb59208bad0f3c68f2e (diff)
download	bcm5719-llvm-6ed9cef25f915d4533f261c401cee29d8d8012d5.tar.gz bcm5719-llvm-6ed9cef25f915d4533f261c401cee29d8d8012d5.zip