[ppc] Correctly compute the cost of loading 32/64 bit memory into VSR

VSX has instructions lxsiwax/lxsdx that can load 32/64 bit value into VSX register cheaply. That patch makes it known to memory cost model, so the vectorization of the test case in pr30990 is beneficial. Differential Revision: https://reviews.llvm.org/D26713 llvm-svn: 288560
author: Guozhi Wei <carrot@google.com> 2016-12-03 00:41:43 +0000
committer: Guozhi Wei <carrot@google.com> 2016-12-03 00:41:43 +0000
commit: 835de1f3ab2df5ce56eab9e5d9021a26fe04b854 (patch)
tree: b7942e7efd680008e726203f88d8a5e394b670e2 /llvm/test
parent: 33f947057dff866c3768835dd111dac6613c31d7 (diff)
download: bcm5719-llvm-835de1f3ab2df5ce56eab9e5d9021a26fe04b854.tar.gz
bcm5719-llvm-835de1f3ab2df5ce56eab9e5d9021a26fe04b854.zip
2 files changed, 159 insertions, 0 deletions
diff --git a/llvm/test/Analysis/CostModel/PowerPC/vsr_load_32_64.ll b/llvm/test/Analysis/CostModel/PowerPC/vsr_load_32_64.ll
new file mode 100644
index 00000000000..4afeabca00a
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/PowerPC/vsr_load_32_64.ll
@@ -0,0 +1,19 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 -mattr=+vsx | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define i32 @loads(i32 %arg) {
+  ; CHECK: cost of 1 {{.*}} load
+  load <4 x i8>, <4 x i8>* undef, align 1
+
+  ; CHECK: cost of 1 {{.*}} load
+  load <8 x i8>, <8 x i8>* undef, align 1
+
+  ; CHECK: cost of 1 {{.*}} load
+  load <2 x i16>, <2 x i16>* undef, align 2
+
+  ; CHECK: cost of 1 {{.*}} load
+  load <4 x i16>, <4 x i16>* undef, align 2
+
+  ret i32 undef
+}
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll
new file mode 100644
index 00000000000..d3cdabd26f5
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll
@@ -0,0 +1,140 @@
+; RUN: opt < %s -loop-vectorize -mcpu=pwr8 -mattr=+vsx -force-vector-interleave=1 -vectorizer-maximize-bandwidth=0 -S | FileCheck %s
+
+target triple = "powerpc64-unknown-linux-gnu"
+
+define signext i32 @foo(i8* readonly %ptr, i32 signext %l) {
+entry:
+  %idx.ext = sext i32 %l to i64
+  %add.ptr = getelementptr inbounds i8, i8* %ptr, i64 %idx.ext
+  %cmp7 = icmp sgt i32 %l, 0
+  br i1 %cmp7, label %while.body.preheader, label %while.end
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %count.09 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %ptr.addr.08 = phi i8* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
+  %0 = load i8, i8* %ptr.addr.08, align 1
+  %cmp1 = icmp slt i8 %0, -64
+  %cond = zext i1 %cmp1 to i32
+  %add = add nsw i32 %cond, %count.09
+  %incdec.ptr = getelementptr inbounds i8, i8* %ptr.addr.08, i64 1
+  %cmp = icmp ult i8* %incdec.ptr, %add.ptr
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit:                               ; preds = %while.body
+  %add.lcssa = phi i32 [ %add, %while.body ]
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %count.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
+  ret i32 %count.0.lcssa
+
+; CHECK: load <4 x i8>
+; CHECK: icmp slt <4 x i8>
+}
+
+
+define signext i16 @foo2(i8* readonly %ptr, i32 signext %l) {
+entry:
+  %idx.ext = sext i32 %l to i64 
+  %add.ptr = getelementptr inbounds i8, i8* %ptr, i64 %idx.ext
+  %cmp7 = icmp sgt i32 %l, 0
+  br i1 %cmp7, label %while.body.preheader, label %while.end
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %count.09 = phi i16 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %ptr.addr.08 = phi i8* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
+  %0 = load i8, i8* %ptr.addr.08, align 1
+  %cmp1 = icmp slt i8 %0, -64 
+  %cond = zext i1 %cmp1 to i16 
+  %add = add nsw i16 %cond, %count.09
+  %incdec.ptr = getelementptr inbounds i8, i8* %ptr.addr.08, i64 1
+  %cmp = icmp ult i8* %incdec.ptr, %add.ptr
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit:                               ; preds = %while.body
+  %add.lcssa = phi i16 [ %add, %while.body ]
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %count.0.lcssa = phi i16 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
+  ret i16 %count.0.lcssa
+
+; CHECK-LABEL: foo2
+; CHECK: load <8 x i8>
+; CHECK: icmp slt <8 x i8>
+}
+
+define signext i32 @foo3(i16* readonly %ptr, i32 signext %l) {
+entry:
+  %idx.ext = sext i32 %l to i64 
+  %add.ptr = getelementptr inbounds i16, i16* %ptr, i64 %idx.ext
+  %cmp7 = icmp sgt i32 %l, 0
+  br i1 %cmp7, label %while.body.preheader, label %while.end
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %count.09 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %ptr.addr.16 = phi i16* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
+  %0 = load i16, i16* %ptr.addr.16, align 1
+  %cmp1 = icmp slt i16 %0, -64 
+  %cond = zext i1 %cmp1 to i32 
+  %add = add nsw i32 %cond, %count.09
+  %incdec.ptr = getelementptr inbounds i16, i16* %ptr.addr.16, i64 1
+  %cmp = icmp ult i16* %incdec.ptr, %add.ptr
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit:                               ; preds = %while.body
+  %add.lcssa = phi i32 [ %add, %while.body ]
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %count.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
+  ret i32 %count.0.lcssa
+
+; CHECK-LABEL: foo3
+; CHECK: load <4 x i16>
+; CHECK: icmp slt <4 x i16>
+}
+
+define i64 @foo4(i16* readonly %ptr, i32 signext %l) {
+entry:
+  %idx.ext = sext i32 %l to i64 
+  %add.ptr = getelementptr inbounds i16, i16* %ptr, i64 %idx.ext
+  %cmp7 = icmp sgt i32 %l, 0
+  br i1 %cmp7, label %while.body.preheader, label %while.end
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %count.09 = phi i64 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %ptr.addr.16 = phi i16* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
+  %0 = load i16, i16* %ptr.addr.16, align 1
+  %cmp1 = icmp slt i16 %0, -64 
+  %cond = zext i1 %cmp1 to i64 
+  %add = add nsw i64 %cond, %count.09
+  %incdec.ptr = getelementptr inbounds i16, i16* %ptr.addr.16, i64 1
+  %cmp = icmp ult i16* %incdec.ptr, %add.ptr
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit:                               ; preds = %while.body
+  %add.lcssa = phi i64 [ %add, %while.body ]
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %count.0.lcssa = phi i64 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
+  ret i64 %count.0.lcssa
+
+; CHECK-LABEL: foo4
+; CHECK: load <2 x i16>
+; CHECK: icmp slt <2 x i16>
+}
+
author	Guozhi Wei <carrot@google.com>	2016-12-03 00:41:43 +0000
committer	Guozhi Wei <carrot@google.com>	2016-12-03 00:41:43 +0000
commit	835de1f3ab2df5ce56eab9e5d9021a26fe04b854 (patch)
tree	b7942e7efd680008e726203f88d8a5e394b670e2 /llvm/test
parent	33f947057dff866c3768835dd111dac6613c31d7 (diff)
download	bcm5719-llvm-835de1f3ab2df5ce56eab9e5d9021a26fe04b854.tar.gz bcm5719-llvm-835de1f3ab2df5ce56eab9e5d9021a26fe04b854.zip