Teach the SLP vectorizer the correct way to check for consecutive access

using GEPs. Previously, it used a number of different heuristics for analyzing the GEPs. Several of these were conservatively correct, but failed to fall back to SCEV even when SCEV might have given a reasonable answer. One was simply incorrect in how it was formulated. There was good code already to recursively evaluate the constant offsets in GEPs, look through pointer casts, etc. In a previous commit, I gathered this into a form that code like the SLP vectorizer can use, which allows all of this code to become quite simple. There is some performance (compile time) concern here at first glance as we're directly attempting to walk both pointers' constant GEP chains. However, a couple of thoughts: 1) In the very common case where there is a dynamic pointer, and a second pointer at a constant offset (usually a stride) from it, this code will actually not do any unnecessary work. 2) InstCombine and other passes work very hard to collapse constant GEPs, so it will be rare that we iterate here for a long time. That said, if there remain performance problems here, there are some obvious things that can improve the situation immensely. Adding a vectorizer-pass-wide memoizer for each individual layer of pointer values, their base values, and the constant offset is likely to be able to completely remove redundant work and strictly limit the scaling of the work to scrape these GEPs. Since this optimization was not done on the prior version (which would still benefit from it), I've not done it here. But if folks have benchmarks that slow down it should be straightforward for them to add. I've added a test case, but I'm not really confident of the amount of testing done for different access patterns, strides, and pointer manipulation. llvm-svn: 189007
author: Chandler Carruth <chandlerc@gmail.com> 2013-08-22 12:45:17 +0000
committer: Chandler Carruth <chandlerc@gmail.com> 2013-08-22 12:45:17 +0000
commit: 1c34afcb613f8eafcb9e40b492fbcb743b4fb94c (patch)
tree: 38e15100f71897eb700baf73371c16e1ad95028a /llvm/test/Transforms/SLPVectorizer/X86/phi.ll
parent: e1de9e9c3314ef55944b163040c2436e827c7172 (diff)
download: bcm5719-llvm-1c34afcb613f8eafcb9e40b492fbcb743b4fb94c.tar.gz
bcm5719-llvm-1c34afcb613f8eafcb9e40b492fbcb743b4fb94c.zip
1 files changed, 39 insertions, 1 deletions
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
index 1c7f9ccf602..f77e945aad9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-100 -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
 target triple = "i386-apple-macosx10.9.0"
@@ -95,3 +95,41 @@ for.end:                                          ; preds = %for.body
   ret i32 0
 }
 
+define void @test(x86_fp80* %i1, x86_fp80* %i2, x86_fp80* %o) {
+; CHECK-LABEL: @test(
+;
+; Test that we correctly recognize the discontiguous memory in arrays where the
+; size is less than the alignment, and through various different GEP formations.
+
+entry:
+  %i1.0 = load x86_fp80* %i1, align 16
+  %i1.gep1 = getelementptr x86_fp80* %i1, i64 1
+  %i1.1 = load x86_fp80* %i1.gep1, align 16
+; CHECK: load x86_fp80*
+; CHECK: load x86_fp80*
+; CHECK: insertelement <2 x x86_fp80>
+; CHECK: insertelement <2 x x86_fp80>
+  br i1 undef, label %then, label %end
+
+then:
+  %i2.gep0 = getelementptr inbounds x86_fp80* %i2, i64 0
+  %i2.0 = load x86_fp80* %i2.gep0, align 16
+  %i2.gep1 = getelementptr inbounds x86_fp80* %i2, i64 1
+  %i2.1 = load x86_fp80* %i2.gep1, align 16
+; CHECK: load x86_fp80*
+; CHECK: load x86_fp80*
+; CHECK: insertelement <2 x x86_fp80>
+; CHECK: insertelement <2 x x86_fp80>
+  br label %end
+
+end:
+  %phi0 = phi x86_fp80 [ %i1.0, %entry ], [ %i2.0, %then ]
+  %phi1 = phi x86_fp80 [ %i1.1, %entry ], [ %i2.1, %then ]
+; CHECK: phi <2 x x86_fp80>
+; CHECK: extractelement <2 x x86_fp80>
+; CHECK: extractelement <2 x x86_fp80>
+  store x86_fp80 %phi0, x86_fp80* %o, align 16
+  %o.gep1 = getelementptr inbounds x86_fp80* %o, i64 1
+  store x86_fp80 %phi1, x86_fp80* %o.gep1, align 16
+  ret void
+}
author	Chandler Carruth <chandlerc@gmail.com>	2013-08-22 12:45:17 +0000
committer	Chandler Carruth <chandlerc@gmail.com>	2013-08-22 12:45:17 +0000
commit	1c34afcb613f8eafcb9e40b492fbcb743b4fb94c (patch)
tree	38e15100f71897eb700baf73371c16e1ad95028a /llvm/test/Transforms/SLPVectorizer/X86/phi.ll
parent	e1de9e9c3314ef55944b163040c2436e827c7172 (diff)
download	bcm5719-llvm-1c34afcb613f8eafcb9e40b492fbcb743b4fb94c.tar.gz bcm5719-llvm-1c34afcb613f8eafcb9e40b492fbcb743b4fb94c.zip