| author | Roman Tereshin <rtereshin@apple.com> | 2018-07-25 21:33:00 +0000 |
|---|---|---|
| committer | Roman Tereshin <rtereshin@apple.com> | 2018-07-25 21:33:00 +0000 |
| commit | 4f10a9d3a3a47cd60e5760ba4434a566054d9562 (patch) | |
| tree | 48697880a0edbe18711bbfe8cedf78e7ede941d8 /llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll | |
| parent | f94c4c84e6b3f74a615db16a67c333485d160777 (diff) | |
[LSV] Look through selects for consecutive addresses
In some cases LSV sees (load/store _ (select _ <pointer expression>
<pointer expression>)) patterns in its input IR, often due to sinking
and other forms of CFG simplification, sometimes interspersed with
bitcasts and all-constant-indices GEPs. With this patch, the
`areConsecutivePointers` method attempts to look through select
instructions as well. This leads to an increased number of successful
vectorizations.
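As a minimal sketch of the simplest shape this enables (distilled from
the @base_case test below, not code taken from the patch itself): two
loads whose addresses are selects on the same condition, where the true
arms and the false arms each form a consecutive pair, can now be merged:

    define void @select_pair(i1 %cnd, i32* %a, i32* %b, <2 x i32>* %out) {
    entry:
      %a1 = getelementptr inbounds i32, i32* %a, i64 1
      %b1 = getelementptr inbounds i32, i32* %b, i64 1
      ; Both selects share %cnd; the true arms (%a, %a1) and the false
      ; arms (%b, %b1) are each consecutive, so the two loads below can
      ; be rewritten as a single load <2 x i32>.
      %p0 = select i1 %cnd, i32* %a, i32* %b
      %p1 = select i1 %cnd, i32* %a1, i32* %b1
      %val0 = load i32, i32* %p0, align 4
      %val1 = load i32, i32* %p1, align 4
      %t0 = insertelement <2 x i32> undef, i32 %val0, i32 0
      %t1 = insertelement <2 x i32> %t0, i32 %val1, i32 1
      store <2 x i32> %t1, <2 x i32>* %out
      ret void
    }

Because both selects share the condition %cnd, the two addresses are
adjacent whichever way the selects resolve.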
Technically, select instructions could appear in index arithmetic as
well; however, we don't see those in our test suites or benchmarks.
Also, there is far more freedom in the shapes of IR that compute
integral indices than in what's common in pointer computations, and it
appears that anything short of making select instructions first-class
citizens of Scalar Evolution would handle them quite unreliably, which
for the purposes of this patch would most definitely be overkill.
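For contrast, a hypothetical example (not from the test suite) of the
index-arithmetic shape this patch deliberately leaves alone; here the
select produces an integer GEP index rather than a pointer, so proving
the resulting addresses consecutive would require Scalar Evolution to
reason through the select:

    ; Hypothetical, for illustration only: the select feeds the index.
    define i32 @select_in_index(i1 %cnd, i32* %a, i64 %i, i64 %j) {
    entry:
      %idx = select i1 %cnd, i64 %i, i64 %j
      %p = getelementptr inbounds i32, i32* %a, i64 %idx
      %v = load i32, i32* %p, align 4
      ret i32 %v
    }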
Reviewed By: rampitec
Differential Revision: https://reviews.llvm.org/D49428
llvm-svn: 337965
Diffstat (limited to 'llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll')
| -rw-r--r-- | llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll | 95 |
1 file changed, 95 insertions, 0 deletions
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll
new file mode 100644
index 00000000000..32fe5eb9ce2
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/selects.ll
@@ -0,0 +1,95 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer -dce -S -o - %s | FileCheck %s
+
+target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+
+define void @base_case(i1 %cnd, i32 addrspace(1)* %a, i32 addrspace(1)* %b, <3 x i32> addrspace(1)* %out) {
+; CHECK-LABEL: @base_case
+; CHECK: load <3 x i32>
+entry:
+  %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 1
+  %gep2 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 2
+  %gep4 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 1
+  %gep5 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 2
+  %selected = select i1 %cnd, i32 addrspace(1)* %a, i32 addrspace(1)* %b
+  %selected14 = select i1 %cnd, i32 addrspace(1)* %gep1, i32 addrspace(1)* %gep4
+  %selected25 = select i1 %cnd, i32 addrspace(1)* %gep2, i32 addrspace(1)* %gep5
+  %val0 = load i32, i32 addrspace(1)* %selected, align 4
+  %val1 = load i32, i32 addrspace(1)* %selected14, align 4
+  %val2 = load i32, i32 addrspace(1)* %selected25, align 4
+  %t0 = insertelement <3 x i32> undef, i32 %val0, i32 0
+  %t1 = insertelement <3 x i32> %t0, i32 %val1, i32 1
+  %t2 = insertelement <3 x i32> %t1, i32 %val2, i32 2
+  store <3 x i32> %t2, <3 x i32> addrspace(1)* %out
+  ret void
+}
+
+define void @scev_targeting_complex_case(i1 %cnd, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %base, <2 x i32> addrspace(1)* %out) {
+; CHECK-LABEL: @scev_targeting_complex_case
+; CHECK: load <2 x i32>
+entry:
+  %base.x4 = shl i32 %base, 2
+  %base.x4.p1 = add i32 %base.x4, 1
+  %base.x4.p2 = add i32 %base.x4, 2
+  %base.x4.p3 = add i32 %base.x4, 3
+  %zext.x4 = zext i32 %base.x4 to i64
+  %zext.x4.p1 = zext i32 %base.x4.p1 to i64
+  %zext.x4.p2 = zext i32 %base.x4.p2 to i64
+  %zext.x4.p3 = zext i32 %base.x4.p3 to i64
+  %base.x16 = mul i64 %zext.x4, 4
+  %base.x16.p4 = shl i64 %zext.x4.p1, 2
+  %base.x16.p8 = shl i64 %zext.x4.p2, 2
+  %base.x16.p12 = mul i64 %zext.x4.p3, 4
+  %a.pi8 = bitcast i32 addrspace(1)* %a to i8 addrspace(1)*
+  %b.pi8 = bitcast i32 addrspace(1)* %b to i8 addrspace(1)*
+  %gep.a.base.x16 = getelementptr inbounds i8, i8 addrspace(1)* %a.pi8, i64 %base.x16
+  %gep.b.base.x16.p4 = getelementptr inbounds i8, i8 addrspace(1)* %b.pi8, i64 %base.x16.p4
+  %gep.a.base.x16.p8 = getelementptr inbounds i8, i8 addrspace(1)* %a.pi8, i64 %base.x16.p8
+  %gep.b.base.x16.p12 = getelementptr inbounds i8, i8 addrspace(1)* %b.pi8, i64 %base.x16.p12
+  %a.base.x16 = bitcast i8 addrspace(1)* %gep.a.base.x16 to i32 addrspace(1)*
+  %b.base.x16.p4 = bitcast i8 addrspace(1)* %gep.b.base.x16.p4 to i32 addrspace(1)*
+  %selected.base.x16.p0.or.4 = select i1 %cnd, i32 addrspace(1)* %a.base.x16, i32 addrspace(1)* %b.base.x16.p4
+  %gep.selected.base.x16.p8.or.12 = select i1 %cnd, i8 addrspace(1)* %gep.a.base.x16.p8, i8 addrspace(1)* %gep.b.base.x16.p12
+  %selected.base.x16.p8.or.12 = bitcast i8 addrspace(1)* %gep.selected.base.x16.p8.or.12 to i32 addrspace(1)*
+  %selected.base.x16.p40.or.44 = getelementptr inbounds i32, i32 addrspace(1)* %selected.base.x16.p0.or.4, i64 10
+  %selected.base.x16.p44.or.48 = getelementptr inbounds i32, i32 addrspace(1)* %selected.base.x16.p8.or.12, i64 9
+  %val0 = load i32, i32 addrspace(1)* %selected.base.x16.p40.or.44, align 4
+  %val1 = load i32, i32 addrspace(1)* %selected.base.x16.p44.or.48, align 4
+  %t0 = insertelement <2 x i32> undef, i32 %val0, i32 0
+  %t1 = insertelement <2 x i32> %t0, i32 %val1, i32 1
+  store <2 x i32> %t1, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+define void @nested_selects(i1 %cnd0, i1 %cnd1, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %base, <2 x i32> addrspace(1)* %out) {
+; CHECK-LABEL: @nested_selects
+; CHECK: load <2 x i32>
+entry:
+  %base.p1 = add nsw i32 %base, 1
+  %base.p2 = add i32 %base, 2
+  %base.p3 = add nsw i32 %base, 3
+  %base.x4 = mul i32 %base, 4
+  %base.x4.p5 = add i32 %base.x4, 5
+  %base.x4.p6 = add i32 %base.x4, 6
+  %sext = sext i32 %base to i64
+  %sext.p1 = sext i32 %base.p1 to i64
+  %sext.p2 = sext i32 %base.p2 to i64
+  %sext.p3 = sext i32 %base.p3 to i64
+  %sext.x4.p5 = sext i32 %base.x4.p5 to i64
+  %sext.x4.p6 = sext i32 %base.x4.p6 to i64
+  %gep.a.base = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext
+  %gep.a.base.p1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.p1
+  %gep.a.base.p2 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.p2
+  %gep.a.base.p3 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.p3
+  %gep.b.base.x4.p5 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.x4.p5
+  %gep.b.base.x4.p6 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %sext.x4.p6
+  %selected.1.L = select i1 %cnd1, i32 addrspace(1)* %gep.a.base.p2, i32 addrspace(1)* %gep.b.base.x4.p5
+  %selected.1.R = select i1 %cnd1, i32 addrspace(1)* %gep.a.base.p3, i32 addrspace(1)* %gep.b.base.x4.p6
+  %selected.0.L = select i1 %cnd0, i32 addrspace(1)* %gep.a.base, i32 addrspace(1)* %selected.1.L
+  %selected.0.R = select i1 %cnd0, i32 addrspace(1)* %gep.a.base.p1, i32 addrspace(1)* %selected.1.R
+  %val0 = load i32, i32 addrspace(1)* %selected.0.L, align 4
+  %val1 = load i32, i32 addrspace(1)* %selected.0.R, align 4
+  %t0 = insertelement <2 x i32> undef, i32 %val0, i32 0
+  %t1 = insertelement <2 x i32> %t0, i32 %val1, i32 1
+  store <2 x i32> %t1, <2 x i32> addrspace(1)* %out
+  ret void
+}

