diff options
author | Alina Sbirlea <asbirlea@google.com> | 2016-08-30 23:53:59 +0000 |
---|---|---|
committer | Alina Sbirlea <asbirlea@google.com> | 2016-08-30 23:53:59 +0000 |
commit | 3f8f7840bf12ffa4bfd558e5115acbd66b39280a (patch) | |
tree | a20c66b655f152f81902a1037d847e83c3a3402f /llvm/test/Transforms/LoadStoreVectorizer | |
parent | fdb32d566a5e81deb9a3e4c0714f74337edcfbb7 (diff) | |
download | bcm5719-llvm-3f8f7840bf12ffa4bfd558e5115acbd66b39280a.tar.gz bcm5719-llvm-3f8f7840bf12ffa4bfd558e5115acbd66b39280a.zip |
[LoadStoreVectorizer] Change VectorSet to Vector to match head and tail positions. Resolves PR29148.
Summary:
LSV was using two vector sets (heads and tails) to track pairs of adjacent positions to vectorize.
A recent optimization is trying to obtain the longest chain to vectorize and assumes the positions
in heads(H) and tails(T) match, which is not the case if there are multiple tails for the same head.
e.g.:
i1: store a[0]
i2: store a[1]
i3: store a[1]
Leads to:
H: i1
T: i2 i3
Instead of:
H: i1 i1
T: i2 i3
So the positions for instructions that follow i3 will have different indexes in H/T.
This patch resolves PR29148.
This issue also surfaced the fact that if the chain is too long, and TLI
returns a "not-fast" answer, the whole chain will be abandoned for
vectorization, even though a smaller one would be beneficial.
Added a testcase and FIXME for this.
Reviewers: tstellarAMD, arsenm, jlebar
Subscribers: mzolotukhin, wdng, llvm-commits
Differential Revision: https://reviews.llvm.org/D24057
llvm-svn: 280179
Diffstat (limited to 'llvm/test/Transforms/LoadStoreVectorizer')
-rw-r--r-- | llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll | 64 | ||||
-rw-r--r-- | llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll | 30 |
2 files changed, 94 insertions, 0 deletions
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll new file mode 100644 index 00000000000..88eca363902 --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll @@ -0,0 +1,64 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s + +target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" + +; Checks that there is no crash when there are multiple tails +; for a the same head starting a chain. +@0 = internal addrspace(3) global [16384 x i32] undef + +; CHECK-LABEL: @no_crash( +; CHECK: store <2 x i32> zeroinitializer +; CHECK: store i32 0 +; CHECK: store i32 0 + +define void @no_crash(i32 %arg) { + %tmp2 = add i32 %arg, 14 + %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp2 + %tmp4 = add i32 %arg, 15 + %tmp5 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp4 + + store i32 0, i32 addrspace(3)* %tmp3, align 4 + store i32 0, i32 addrspace(3)* %tmp5, align 4 + store i32 0, i32 addrspace(3)* %tmp5, align 4 + store i32 0, i32 addrspace(3)* %tmp5, align 4 + + ret void +} + +; Check adjiacent memory locations are properly matched and the +; longest chain vectorized + +; CHECK-LABEL: @interleave_get_longest +; CHECK: load <2 x i32> +; CHECK: load i32 +; CHECK: store <2 x i32> zeroinitializer +; CHECK: load i32 +; CHECK: load <2 x i32> +; CHECK: load i32 +; CHECK: load i32 + +define void @interleave_get_longest(i32 %arg) { + %a1 = add i32 %arg, 1 + %a2 = add i32 %arg, 2 + %a3 = add i32 %arg, 3 + %a4 = add i32 %arg, 4 + %tmp1 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %arg + %tmp2 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a1 + %tmp3 = getelementptr [16384 x i32], 
[16384 x i32] addrspace(3)* @0, i32 0, i32 %a2 + %tmp4 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a3 + %tmp5 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a4 + + %l1 = load i32, i32 addrspace(3)* %tmp2, align 4 + %l2 = load i32, i32 addrspace(3)* %tmp1, align 4 + store i32 0, i32 addrspace(3)* %tmp2, align 4 + store i32 0, i32 addrspace(3)* %tmp1, align 4 + %l3 = load i32, i32 addrspace(3)* %tmp2, align 4 + %l4 = load i32, i32 addrspace(3)* %tmp3, align 4 + %l5 = load i32, i32 addrspace(3)* %tmp4, align 4 + %l6 = load i32, i32 addrspace(3)* %tmp5, align 4 + %l7 = load i32, i32 addrspace(3)* %tmp5, align 4 + %l8 = load i32, i32 addrspace(3)* %tmp5, align 4 + + ret void +} + diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll index 34ec43d1a66..915b94ac155 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll @@ -85,3 +85,33 @@ define void @chain_prefix_suffix(i32* noalias %ptr) { ret void } +; FIXME: If the chain is too long and TLI says misaligned is not fast, +; then LSV fails to vectorize anything in that chain. +; To reproduce below, add a tmp5 (ptr+4) and load tmp5 into l6 and l7. 
+ +; CHECK-LABEL: @interleave_get_longest +; CHECK: load <3 x i32> +; CHECK: load i32 +; CHECK: store <2 x i32> zeroinitializer +; CHECK: load i32 +; CHECK: load i32 +; CHECK: load i32 + +define void @interleave_get_longest(i32* noalias %ptr) { + %tmp1 = getelementptr i32, i32* %ptr, i64 0 + %tmp2 = getelementptr i32, i32* %ptr, i64 1 + %tmp3 = getelementptr i32, i32* %ptr, i64 2 + %tmp4 = getelementptr i32, i32* %ptr, i64 3 + + %l1 = load i32, i32* %tmp2, align 4 + %l2 = load i32, i32* %tmp1, align 4 + store i32 0, i32* %tmp2, align 4 + store i32 0, i32* %tmp1, align 4 + %l3 = load i32, i32* %tmp2, align 4 + %l4 = load i32, i32* %tmp3, align 4 + %l5 = load i32, i32* %tmp4, align 4 + %l6 = load i32, i32* %tmp4, align 4 + %l7 = load i32, i32* %tmp4, align 4 + + ret void +} |