summaryrefslogtreecommitdiffstats
path: root/llvm/test/Transforms/LoadStoreVectorizer
diff options
context:
space:
mode:
authorAlina Sbirlea <asbirlea@google.com>2016-08-30 23:53:59 +0000
committerAlina Sbirlea <asbirlea@google.com>2016-08-30 23:53:59 +0000
commit3f8f7840bf12ffa4bfd558e5115acbd66b39280a (patch)
treea20c66b655f152f81902a1037d847e83c3a3402f /llvm/test/Transforms/LoadStoreVectorizer
parentfdb32d566a5e81deb9a3e4c0714f74337edcfbb7 (diff)
downloadbcm5719-llvm-3f8f7840bf12ffa4bfd558e5115acbd66b39280a.tar.gz
bcm5719-llvm-3f8f7840bf12ffa4bfd558e5115acbd66b39280a.zip
[LoadStoreVectorizer] Change VectorSet to Vector to match head and tail positions. Resolves PR29148.
Summary: LSV was using two vector sets (heads and tails) to track pairs of adjacent positions to vectorize. A recent optimization is trying to obtain the longest chain to vectorize and assumes the positions in heads (H) and tails (T) match, which is not the case if there are multiple tails for the same head. e.g.: i1: store a[0] i2: store a[1] i3: store a[1] Leads to: H: i1 T: i2 i3 Instead of: H: i1 i1 T: i2 i3 So the positions for instructions that follow i3 will have different indexes in H/T. This patch resolves PR29148. This issue also surfaced the fact that if the chain is too long, and TLI returns a "not-fast" answer, the whole chain will be abandoned for vectorization, even though a smaller one would be beneficial. Added a testcase and FIXME for this. Reviewers: tstellarAMD, arsenm, jlebar Subscribers: mzolotukhin, wdng, llvm-commits Differential Revision: https://reviews.llvm.org/D24057 llvm-svn: 280179
Diffstat (limited to 'llvm/test/Transforms/LoadStoreVectorizer')
-rw-r--r--llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll64
-rw-r--r--llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll30
2 files changed, 94 insertions, 0 deletions
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
new file mode 100644
index 00000000000..88eca363902
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
@@ -0,0 +1,64 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s
+
+target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+
+; Checks that there is no crash when there are multiple tails
+; for the same head starting a chain.
+@0 = internal addrspace(3) global [16384 x i32] undef
+
+; CHECK-LABEL: @no_crash(
+; CHECK: store <2 x i32> zeroinitializer
+; CHECK: store i32 0
+; CHECK: store i32 0
+
+define void @no_crash(i32 %arg) {
+ %tmp2 = add i32 %arg, 14
+ %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp2
+ %tmp4 = add i32 %arg, 15
+ %tmp5 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp4
+
+ store i32 0, i32 addrspace(3)* %tmp3, align 4
+ store i32 0, i32 addrspace(3)* %tmp5, align 4
+ store i32 0, i32 addrspace(3)* %tmp5, align 4
+ store i32 0, i32 addrspace(3)* %tmp5, align 4
+
+ ret void
+}
+
+; Check that adjacent memory locations are properly matched and the
+; longest chain vectorized
+
+; CHECK-LABEL: @interleave_get_longest
+; CHECK: load <2 x i32>
+; CHECK: load i32
+; CHECK: store <2 x i32> zeroinitializer
+; CHECK: load i32
+; CHECK: load <2 x i32>
+; CHECK: load i32
+; CHECK: load i32
+
+define void @interleave_get_longest(i32 %arg) {
+ %a1 = add i32 %arg, 1
+ %a2 = add i32 %arg, 2
+ %a3 = add i32 %arg, 3
+ %a4 = add i32 %arg, 4
+ %tmp1 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %arg
+ %tmp2 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a1
+ %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a2
+ %tmp4 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a3
+ %tmp5 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a4
+
+ %l1 = load i32, i32 addrspace(3)* %tmp2, align 4
+ %l2 = load i32, i32 addrspace(3)* %tmp1, align 4
+ store i32 0, i32 addrspace(3)* %tmp2, align 4
+ store i32 0, i32 addrspace(3)* %tmp1, align 4
+ %l3 = load i32, i32 addrspace(3)* %tmp2, align 4
+ %l4 = load i32, i32 addrspace(3)* %tmp3, align 4
+ %l5 = load i32, i32 addrspace(3)* %tmp4, align 4
+ %l6 = load i32, i32 addrspace(3)* %tmp5, align 4
+ %l7 = load i32, i32 addrspace(3)* %tmp5, align 4
+ %l8 = load i32, i32 addrspace(3)* %tmp5, align 4
+
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
index 34ec43d1a66..915b94ac155 100644
--- a/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
@@ -85,3 +85,33 @@ define void @chain_prefix_suffix(i32* noalias %ptr) {
ret void
}
+; FIXME: If the chain is too long and TLI says misaligned is not fast,
+; then LSV fails to vectorize anything in that chain.
+; To reproduce below, add a tmp5 (ptr+4) and load tmp5 into l6 and l7.
+
+; CHECK-LABEL: @interleave_get_longest
+; CHECK: load <3 x i32>
+; CHECK: load i32
+; CHECK: store <2 x i32> zeroinitializer
+; CHECK: load i32
+; CHECK: load i32
+; CHECK: load i32
+
+define void @interleave_get_longest(i32* noalias %ptr) {
+ %tmp1 = getelementptr i32, i32* %ptr, i64 0
+ %tmp2 = getelementptr i32, i32* %ptr, i64 1
+ %tmp3 = getelementptr i32, i32* %ptr, i64 2
+ %tmp4 = getelementptr i32, i32* %ptr, i64 3
+
+ %l1 = load i32, i32* %tmp2, align 4
+ %l2 = load i32, i32* %tmp1, align 4
+ store i32 0, i32* %tmp2, align 4
+ store i32 0, i32* %tmp1, align 4
+ %l3 = load i32, i32* %tmp2, align 4
+ %l4 = load i32, i32* %tmp3, align 4
+ %l5 = load i32, i32* %tmp4, align 4
+ %l6 = load i32, i32* %tmp4, align 4
+ %l7 = load i32, i32* %tmp4, align 4
+
+ ret void
+}
OpenPOWER on IntegriCloud