[SCEV] Add zext(C + x + ...) -> D + zext(C-D + x + ...)<nuw><nsw> transform

if the top-level addition in (D + (C-D + x + ...)) could be proven to not wrap, where the choice of D also maximizes the number of trailing zeroes of (C-D + x + ...), ensuring homogeneous behaviour of the transformation and better canonicalization of such expressions. This enables better canonicalization of expressions like 1 + zext(5 + 20 * %x + 24 * %y) and zext(6 + 20 * %x + 24 * %y), which both get transformed to 2 + zext(4 + 20 * %x + 24 * %y). This pattern is common in address arithmetic, and the transformation makes it easier for passes like LoadStoreVectorizer to prove that 2 or more memory accesses are consecutive and optimize (vectorize) them. Reviewed By: mzolotukhin Differential Revision: https://reviews.llvm.org/D48853 llvm-svn: 337859
author: Roman Tereshin <rtereshin@apple.com> 2018-07-24 21:48:56 +0000
committer: Roman Tereshin <rtereshin@apple.com> 2018-07-24 21:48:56 +0000
commit: 1ba1f9310c26507bdeaa70695e4e5529b33e842d (patch)
tree: 6778503ce98ac0f807dd0ddce31681974c9cc5f6 /llvm/test/Transforms/LoadStoreVectorizer/X86
parent: 5ddc0a2b149daf41a4df3d215555c96343326cf5 (diff)
download: bcm5719-llvm-1ba1f9310c26507bdeaa70695e4e5529b33e842d.tar.gz
bcm5719-llvm-1ba1f9310c26507bdeaa70695e4e5529b33e842d.zip
1 files changed, 78 insertions, 0 deletions
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll b/llvm/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll
new file mode 100644
index 00000000000..a9b72294d90
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll
@@ -0,0 +1,78 @@
+; RUN: opt -codegenprepare -load-store-vectorizer %s -S -o - | FileCheck %s
+; RUN: opt                 -load-store-vectorizer %s -S -o - | FileCheck %s
+
+target triple = "x86_64--"
+
+%union = type { { [4 x [4 x [4 x [16 x float]]]], [4 x [4 x [4 x [16 x float]]]], [10 x [10 x [4 x float]]] } }
+
+@global_pointer = external unnamed_addr global { %union, [2000 x i8] }, align 4
+
+; Function Attrs: convergent nounwind
+define void @test(i32 %base) #0 {
+; CHECK-LABEL: @test(
+; CHECK-NOT: load i32
+; CHECK: load <2 x i32>
+; CHECK-NOT: load i32
+entry:
+  %mul331 = and i32 %base, -4  ; clear the two low bits of %base
+  %add350.4 = add i32 4, %mul331
+  %idx351.4 = zext i32 %add350.4 to i64
+  %arrayidx352.4 = getelementptr inbounds { %union, [2000 x i8] }, { %union, [2000 x i8] }* @global_pointer, i64 0, i32 0, i32 0, i32 1, i64 0, i64 0, i64 0, i64 %idx351.4
+  %tmp296.4 = bitcast float* %arrayidx352.4 to i32*
+  %add350.5 = add i32 5, %mul331  ; same masked base as %add350.4, constant larger by one
+  %idx351.5 = zext i32 %add350.5 to i64
+  %arrayidx352.5 = getelementptr inbounds { %union, [2000 x i8] }, { %union, [2000 x i8] }* @global_pointer, i64 0, i32 0, i32 0, i32 1, i64 0, i64 0, i64 0, i64 %idx351.5
+  %tmp296.5 = bitcast float* %arrayidx352.5 to i32*
+  %cnd = icmp ult i32 %base, 1000
+  br i1 %cnd, label %loads, label %exit
+
+loads:
+  ; The loads are deliberately placed in a different BB from the GEPs: if and
+  ; only if that is the case, codegenprepare would try to turn the GEPs into
+  ; math, which makes LoadStoreVectorizer's job harder.
+  %tmp297.4 = load i32, i32* %tmp296.4, align 4, !tbaa !0
+  %tmp297.5 = load i32, i32* %tmp296.5, align 4, !tbaa !0
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+define void @test.codegenprepared(i32 %base) #0 {
+; CHECK-LABEL: @test.codegenprepared(
+; CHECK-NOT: load i32
+; CHECK: load <2 x i32>
+; CHECK-NOT: load i32
+entry:
+  %mul331 = and i32 %base, -4  ; clear the two low bits of %base
+  %add350.4 = add i32 4, %mul331
+  %idx351.4 = zext i32 %add350.4 to i64
+  %add350.5 = add i32 5, %mul331  ; same masked base as %add350.4, constant larger by one
+  %idx351.5 = zext i32 %add350.5 to i64
+  %cnd = icmp ult i32 %base, 1000
+  br i1 %cnd, label %loads, label %exit
+
+loads:                                            ; preds = %entry
+  %sunkaddr = mul i64 %idx351.4, 4  ; scale the zero-extended index by 4 for the i8 GEP below
+  %sunkaddr1 = getelementptr inbounds i8, i8* bitcast ({ %union, [2000 x i8] }* @global_pointer to i8*), i64 %sunkaddr
+  %sunkaddr2 = getelementptr inbounds i8, i8* %sunkaddr1, i64 4096  ; 4096 = 4*4*4*16 floats * 4 bytes; presumably skips the first array of %union
+  %0 = bitcast i8* %sunkaddr2 to i32*
+  %tmp297.4 = load i32, i32* %0, align 4, !tbaa !0
+  %sunkaddr3 = mul i64 %idx351.5, 4  ; same scaling, applied to the second index
+  %sunkaddr4 = getelementptr inbounds i8, i8* bitcast ({ %union, [2000 x i8] }* @global_pointer to i8*), i64 %sunkaddr3
+  %sunkaddr5 = getelementptr inbounds i8, i8* %sunkaddr4, i64 4096  ; same constant 4096-byte offset as %sunkaddr2
+  %1 = bitcast i8* %sunkaddr5 to i32*
+  %tmp297.5 = load i32, i32* %1, align 4, !tbaa !0
+  br label %exit
+
+exit:                                             ; preds = %loads, %entry
+  ret void
+}
+
+attributes #0 = { convergent nounwind }
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"float", !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C++ TBAA"}
author	Roman Tereshin <rtereshin@apple.com>	2018-07-24 21:48:56 +0000
committer	Roman Tereshin <rtereshin@apple.com>	2018-07-24 21:48:56 +0000
commit	1ba1f9310c26507bdeaa70695e4e5529b33e842d (patch)
tree	6778503ce98ac0f807dd0ddce31681974c9cc5f6 /llvm/test/Transforms/LoadStoreVectorizer/X86
parent	5ddc0a2b149daf41a4df3d215555c96343326cf5 (diff)
download	bcm5719-llvm-1ba1f9310c26507bdeaa70695e4e5529b33e842d.tar.gz bcm5719-llvm-1ba1f9310c26507bdeaa70695e4e5529b33e842d.zip