Use uniforms set to populate VecValuesToIgnore.

For instructions in uniform set, they will not have vector versions so add them to VecValuesToIgnore. For induction vars, those only used in uniform instructions or consecutive ptrs instructions have already been added to VecValuesToIgnore above. For those induction vars which are only used in uniform instructions or non-consecutive/non-gather scatter ptr instructions, the related phi and update will also be added into VecValuesToIgnore set. The change will make the vector RegUsages estimation less conservative. Differential Revision: https://reviews.llvm.org/D20474 llvm-svn: 275912
author: Wei Mi <wmi@google.com> 2016-07-18 20:59:53 +0000
committer: Wei Mi <wmi@google.com> 2016-07-18 20:59:53 +0000
commit: 1fd25726afcec5de0143022f6a742ff33a6bd080 (patch)
tree: 3f1d2a6c908fc4abd5d061044559d8f55bd80bd9 /llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
parent: 5f5eb58eb5c19620d669608c25a505f0f45d35d7 (diff)
download: bcm5719-llvm-1fd25726afcec5de0143022f6a742ff33a6bd080.tar.gz
bcm5719-llvm-1fd25726afcec5de0143022f6a742ff33a6bd080.zip
1 files changed, 72 insertions, 4 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
index 47a6e1029ed..6133635a8ad 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
@@ -1,9 +1,7 @@
-; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -S 2>&1 | FileCheck %s
+; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s
+; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -mtriple=x86_64-unknown-linux -mattr=+avx512f -S 2>&1 | FileCheck %s --check-prefix=AVX512F
 ; REQUIRES: asserts
 
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
 @a = global [1024 x i8] zeroinitializer, align 16
 @b = global [1024 x i8] zeroinitializer, align 16
 
@@ -45,6 +43,45 @@ for.body:
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
 
+define i32 @goo() {
+; For indvars.iv used in a computating chain only feeding into getelementptr or cmp,
+; it will not have vector version and the vector register usage will not exceed the
+; available vector register number.
+; CHECK-LABEL: goo
+; CHECK:      LV(REG): VF = 4
+; CHECK-NEXT: LV(REG): Found max usage: 4
+; CHECK:      LV(REG): VF = 8
+; CHECK-NEXT: LV(REG): Found max usage: 7
+; CHECK:      LV(REG): VF = 16
+; CHECK-NEXT: LV(REG): Found max usage: 13
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %add.lcssa
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %s.015 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %tmp1 = add nsw i64 %indvars.iv, 3
+  %arrayidx = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %tmp1
+  %tmp = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %tmp to i32
+  %tmp2 = add nsw i64 %indvars.iv, 2
+  %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %tmp2
+  %tmp3 = load i8, i8* %arrayidx2, align 1
+  %conv3 = zext i8 %tmp3 to i32
+  %sub = sub nsw i32 %conv, %conv3
+  %ispos = icmp sgt i32 %sub, -1
+  %neg = sub nsw i32 0, %sub
+  %tmp4 = select i1 %ispos, i32 %sub, i32 %neg
+  %add = add nsw i32 %tmp4, %s.015
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
 define i64 @bar(i64* nocapture %a) {
 ; CHECK-LABEL: bar
 ; CHECK:       LV(REG): VF = 2
@@ -69,3 +106,34 @@ for.body:
   %exitcond = icmp eq i64 %inc, 1024
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
+
+@d = external global [0 x i64], align 8
+@e = external global [0 x i32], align 4
+@c = external global [0 x i32], align 4
+
+define void @hoo(i32 %n) {
+; For c[i] = e[d[i]] in the loop, e[d[i]] is not consecutive but its index %tmp can
+; be gathered into a vector. For VF == 16, the vector version of %tmp will be <16 x i64>
+; so the max usage of AVX512 vector register will be 2.
+; AVX512F-LABEL: bar
+; AVX512F:       LV(REG): VF = 16
+; AVX512F:       LV(REG): Found max usage: 2
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 %indvars.iv
+  %tmp = load i64, i64* %arrayidx, align 8
+  %arrayidx1 = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 %tmp
+  %tmp1 = load i32, i32* %arrayidx1, align 4
+  %arrayidx3 = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 %indvars.iv
+  store i32 %tmp1, i32* %arrayidx3, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 10000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
author	Wei Mi <wmi@google.com>	2016-07-18 20:59:53 +0000
committer	Wei Mi <wmi@google.com>	2016-07-18 20:59:53 +0000
commit	1fd25726afcec5de0143022f6a742ff33a6bd080 (patch)
tree	3f1d2a6c908fc4abd5d061044559d8f55bd80bd9 /llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
parent	5f5eb58eb5c19620d669608c25a505f0f45d35d7 (diff)
download	bcm5719-llvm-1fd25726afcec5de0143022f6a742ff33a6bd080.tar.gz bcm5719-llvm-1fd25726afcec5de0143022f6a742ff33a6bd080.zip