Recommit the patch "Use uniforms set to populate VecValuesToIgnore".

Instructions in the uniform set will not have vector versions, so add them to VecValuesToIgnore. Induction variables that are used only in uniform instructions or consecutive-pointer instructions have already been added to VecValuesToIgnore above. For induction variables that are used only in uniform instructions or in non-consecutive, non-gather/scatter pointer instructions, the related phi and update are also added to the VecValuesToIgnore set. The change makes the vector RegUsages estimation less conservative. Differential Revision: https://reviews.llvm.org/D20474 The recommit fixed the testcase global_alias.ll. llvm-svn: 275936
author: Wei Mi <wmi@google.com> 2016-07-19 00:50:43 +0000
committer: Wei Mi <wmi@google.com> 2016-07-19 00:50:43 +0000
commit: 79997a24d750f3732f071ddc10c86d072b8cf735 (patch)
tree: 9c561ace367d9f14c6b33b1ddc4c9ec7359824ea /llvm
parent: fe358066eaea7db1a25a39b7dcf50cfaa7dd2c6d (diff)
download: bcm5719-llvm-79997a24d750f3732f071ddc10c86d072b8cf735.tar.gz
bcm5719-llvm-79997a24d750f3732f071ddc10c86d072b8cf735.zip
6 files changed, 136 insertions, 66 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8b85e320d3b..d42c9cb6e84 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6156,6 +6156,16 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
   return false;
 }
 
+/// Return the pointer operand of \p I when it is a LoadInst or a StoreInst.
+/// Returns nullptr for any other kind of value.
+static Value *getPointerOperand(Value *I) {
+  if (LoadInst *LI = dyn_cast<LoadInst>(I))
+    return LI->getPointerOperand();
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->getPointerOperand();
+  return nullptr;
+}
+
 void LoopVectorizationCostModel::collectValuesToIgnore() {
   // Ignore ephemeral values.
   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
@@ -6168,63 +6178,44 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
   }
 
-  // Ignore induction phis that are only used in either GetElementPtr or ICmp
-  // instruction to exit loop. Induction variables usually have large types and
-  // can have big impact when estimating register usage.
-  // This is for when VF > 1.
+  // Insert uniform instruction into VecValuesToIgnore.
+  // Collect non-gather/scatter and non-consecutive ptr in NonConsecutivePtr.
+  SmallPtrSet<Instruction *, 8> NonConsecutivePtr;
+  for (auto *BB : TheLoop->getBlocks()) {
+    for (auto &I : *BB) {
+      if (Legal->isUniformAfterVectorization(&I))
+        VecValuesToIgnore.insert(&I);
+      Instruction *PI = dyn_cast_or_null<Instruction>(getPointerOperand(&I));
+      if (PI && !Legal->isConsecutivePtr(PI) &&
+          !isGatherOrScatterLegal(&I, PI, Legal))
+        NonConsecutivePtr.insert(PI);
+    }
+  }
+
+  // Ignore induction phis that are either used in uniform instructions or
+  // NonConsecutivePtr.
   for (auto &Induction : *Legal->getInductionVars()) {
     auto *PN = Induction.first;
     auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch());
 
-    // Check that the PHI is only used by the induction increment (UpdateV) or
-    // by GEPs. Then check that UpdateV is only used by a compare instruction,
-    // the loop header PHI, or by GEPs.
-    // FIXME: Need precise def-use analysis to determine if this instruction
-    // variable will be vectorized.
-    if (all_of(PN->users(),
-               [&](const User *U) -> bool {
-                 return U == UpdateV || isa<GetElementPtrInst>(U);
-               }) &&
-        all_of(UpdateV->users(), [&](const User *U) -> bool {
-          return U == PN || isa<ICmpInst>(U) || isa<GetElementPtrInst>(U);
-        })) {
+    if (std::all_of(PN->user_begin(), PN->user_end(),
+                    [&](User *U) -> bool {
+                      Instruction *UI = dyn_cast<Instruction>(U);
+                      return U == UpdateV || !TheLoop->contains(UI) ||
+                             Legal->isUniformAfterVectorization(UI) ||
+                             NonConsecutivePtr.count(UI);
+                    }) &&
+        std::all_of(UpdateV->user_begin(), UpdateV->user_end(),
+                    [&](User *U) -> bool {
+                      Instruction *UI = dyn_cast<Instruction>(U);
+                      return U == PN || !TheLoop->contains(UI) ||
+                             Legal->isUniformAfterVectorization(UI) ||
+                             NonConsecutivePtr.count(UI);
+                    })) {
       VecValuesToIgnore.insert(PN);
       VecValuesToIgnore.insert(UpdateV);
     }
   }
-
-  // Ignore instructions that will not be vectorized.
-  // This is for when VF > 1.
-  for (BasicBlock *BB : TheLoop->blocks()) {
-    for (auto &Inst : *BB) {
-      switch (Inst.getOpcode())
-      case Instruction::GetElementPtr: {
-        // Ignore GEP if its last operand is an induction variable so that it is
-        // a consecutive load/store and won't be vectorized as scatter/gather
-        // pattern.
-
-        GetElementPtrInst *Gep = cast<GetElementPtrInst>(&Inst);
-        unsigned NumOperands = Gep->getNumOperands();
-        unsigned InductionOperand = getGEPInductionOperand(Gep);
-        bool GepToIgnore = true;
-
-        // Check that all of the gep indices are uniform except for the
-        // induction operand.
-        for (unsigned i = 0; i != NumOperands; ++i) {
-          if (i != InductionOperand &&
-              !PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)),
-                                            TheLoop)) {
-            GepToIgnore = false;
-            break;
-          }
-        }
-
-        if (GepToIgnore)
-          VecValuesToIgnore.insert(&Inst);
-        break;
-      }
-    }
-  }
 }
 
 void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll
index fed186b9b67..65b3919585e 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll
@@ -43,7 +43,7 @@ for.end12:                                        ; preds = %for.end, %entry
 
 ; CHECK-LABEL: @s173
 ; CHECK: load <4 x float>, <4 x float>*
-; CHECK: add nsw i64 %1, 16000
+; CHECK: add i64 %index, 16000
 ; CHECK: ret i32 0
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/avx512.ll b/llvm/test/Transforms/LoopVectorize/X86/avx512.ll
index 754e859cd8f..1eb1cd3f5d7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/avx512.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/avx512.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-apple-macosx10.9.0"
 ; loop.
 
 ; CHECK-LABEL: f:
-; CHECK: vmovdqu32 %zmm{{.}}, (
+; CHECK: vmovdqu32 %zmm{{.}},
 ; CHECK-NOT: %ymm
 
 define void @f(i32* %a, i32 %n) {
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
index 47a6e1029ed..6133635a8ad 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
@@ -1,9 +1,7 @@
-; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -S 2>&1 | FileCheck %s
+; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s
+; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -mtriple=x86_64-unknown-linux -mattr=+avx512f -S 2>&1 | FileCheck %s --check-prefix=AVX512F
 ; REQUIRES: asserts
 
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
 @a = global [1024 x i8] zeroinitializer, align 16
 @b = global [1024 x i8] zeroinitializer, align 16
 
@@ -45,6 +43,45 @@ for.body:
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
 
+define i32 @goo() {
+; For indvars.iv used in a computation chain only feeding into getelementptr or cmp,
+; it will not have a vector version and the vector register usage will not exceed the
+; number of available vector registers.
+; CHECK-LABEL: goo
+; CHECK:      LV(REG): VF = 4
+; CHECK-NEXT: LV(REG): Found max usage: 4
+; CHECK:      LV(REG): VF = 8
+; CHECK-NEXT: LV(REG): Found max usage: 7
+; CHECK:      LV(REG): VF = 16
+; CHECK-NEXT: LV(REG): Found max usage: 13
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %add.lcssa
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %s.015 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %tmp1 = add nsw i64 %indvars.iv, 3
+  %arrayidx = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %tmp1
+  %tmp = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %tmp to i32
+  %tmp2 = add nsw i64 %indvars.iv, 2
+  %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %tmp2
+  %tmp3 = load i8, i8* %arrayidx2, align 1
+  %conv3 = zext i8 %tmp3 to i32
+  %sub = sub nsw i32 %conv, %conv3
+  %ispos = icmp sgt i32 %sub, -1
+  %neg = sub nsw i32 0, %sub
+  %tmp4 = select i1 %ispos, i32 %sub, i32 %neg
+  %add = add nsw i32 %tmp4, %s.015
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
 define i64 @bar(i64* nocapture %a) {
 ; CHECK-LABEL: bar
 ; CHECK:       LV(REG): VF = 2
@@ -69,3 +106,34 @@ for.body:
   %exitcond = icmp eq i64 %inc, 1024
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
+
+@d = external global [0 x i64], align 8
+@e = external global [0 x i32], align 4
+@c = external global [0 x i32], align 4
+
+define void @hoo(i32 %n) {
+; For c[i] = e[d[i]] in the loop, e[d[i]] is not consecutive but its index %tmp can
+; be gathered into a vector. For VF == 16, the vector version of %tmp will be <16 x i64>
+; so the max usage of AVX512 vector register will be 2.
+; AVX512F-LABEL: hoo
+; AVX512F:       LV(REG): VF = 16
+; AVX512F:       LV(REG): Found max usage: 2
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 %indvars.iv
+  %tmp = load i64, i64* %arrayidx, align 8
+  %arrayidx1 = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 %tmp
+  %tmp1 = load i32, i32* %arrayidx1, align 4
+  %arrayidx3 = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 %indvars.iv
+  store i32 %tmp1, i32* %arrayidx3, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 10000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/global_alias.ll b/llvm/test/Transforms/LoopVectorize/global_alias.ll
index 0da841bcbbd..16f50951a15 100644
--- a/llvm/test/Transforms/LoopVectorize/global_alias.ll
+++ b/llvm/test/Transforms/LoopVectorize/global_alias.ll
@@ -387,7 +387,7 @@ for.end:                                          ; preds = %for.cond
 ;   return Foo.A[a];
 ; }
 ; CHECK-LABEL: define i32 @noAlias08(
-; CHECK: sub nuw nsw <4 x i32>
+; CHECK: sub <4 x i32>
 ; CHECK: ret
 
 define i32 @noAlias08(i32 %a) #0 {
@@ -439,7 +439,7 @@ for.end:                                          ; preds = %for.cond
 ;   return Foo.A[a];
 ; }
 ; CHECK-LABEL: define i32 @noAlias09(
-; CHECK: sub nuw nsw <4 x i32>
+; CHECK: sub <4 x i32>
 ; CHECK: ret
 
 define i32 @noAlias09(i32 %a) #0 {
@@ -721,7 +721,7 @@ for.end:                                          ; preds = %for.cond
 ;   return Foo.A[a];
 ; }
 ; CHECK-LABEL: define i32 @noAlias14(
-; CHECK: sub nuw nsw <4 x i32>
+; CHECK: sub <4 x i32>
 ; CHECK: ret
 
 define i32 @noAlias14(i32 %a) #0 {
diff --git a/llvm/test/Transforms/LoopVectorize/reverse_induction.ll b/llvm/test/Transforms/LoopVectorize/reverse_induction.ll
index 24ffb6167de..ae8f9b3390d 100644
--- a/llvm/test/Transforms/LoopVectorize/reverse_induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/reverse_induction.ll
@@ -118,11 +118,16 @@ loopend:
 ; }
 
 ; CHECK-LABEL: @reverse_forward_induction_i64_i8(
-; CHECK: vector.body
 ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK: %vec.ind = phi <4 x i64> [ <i64 1023, i64 1022, i64 1021, i64 1020>, %vector.ph ]
-; CHECK: %step.add = add <4 x i64> %vec.ind, <i64 -4, i64 -4, i64 -4, i64 -4>
-; CHECK: trunc i64 %index to i8
+; CHECK: %offset.idx = sub i64 1023, %index
+; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0
+; CHECK: %[[a1:.+]] = add i64 %offset.idx, -1
+; CHECK: %[[a2:.+]] = add i64 %offset.idx, -2
+; CHECK: %[[a3:.+]] = add i64 %offset.idx, -3
+; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4
+; CHECK: %[[a5:.+]] = add i64 %offset.idx, -5
+; CHECK: %[[a6:.+]] = add i64 %offset.idx, -6
+; CHECK: %[[a7:.+]] = add i64 %offset.idx, -7
 
 define void @reverse_forward_induction_i64_i8() {
 entry:
@@ -145,10 +150,16 @@ while.end:
 }
 
 ; CHECK-LABEL: @reverse_forward_induction_i64_i8_signed(
-; CHECK: vector.body:
-; CHECK:  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK: %vec.ind = phi <4 x i64> [ <i64 1023, i64 1022, i64 1021, i64 1020>, %vector.ph ]
-; CHECK: %step.add = add <4 x i64> %vec.ind, <i64 -4, i64 -4, i64 -4, i64 -4>
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %offset.idx = sub i64 1023, %index
+; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0
+; CHECK: %[[a1:.+]] = add i64 %offset.idx, -1
+; CHECK: %[[a2:.+]] = add i64 %offset.idx, -2
+; CHECK: %[[a3:.+]] = add i64 %offset.idx, -3
+; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4
+; CHECK: %[[a5:.+]] = add i64 %offset.idx, -5
+; CHECK: %[[a6:.+]] = add i64 %offset.idx, -6
+; CHECK: %[[a7:.+]] = add i64 %offset.idx, -7
 
 define void @reverse_forward_induction_i64_i8_signed() {
 entry:
author	Wei Mi <wmi@google.com>	2016-07-19 00:50:43 +0000
committer	Wei Mi <wmi@google.com>	2016-07-19 00:50:43 +0000
commit	79997a24d750f3732f071ddc10c86d072b8cf735 (patch)
tree	9c561ace367d9f14c6b33b1ddc4c9ec7359824ea /llvm
parent	fe358066eaea7db1a25a39b7dcf50cfaa7dd2c6d (diff)
download	bcm5719-llvm-79997a24d750f3732f071ddc10c86d072b8cf735.tar.gz bcm5719-llvm-79997a24d750f3732f071ddc10c86d072b8cf735.zip