path: root/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll
author    Mohammad Shahid <Asghar-ahmad.Shahid@amd.com>  2017-09-20 08:18:28 +0000
committer Mohammad Shahid <Asghar-ahmad.Shahid@amd.com>  2017-09-20 08:18:28 +0000
commit    f8db9bd857916c132bb943a2784475f5ca00e92f (patch)
tree      c48c00bf5ff1c0fe97c3cf81e32895d02b67431d /llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll
parent    92980ce6aacc04495f26c6b2cb51652ac68ae43b (diff)
[SLP] Vectorize jumbled memory loads.
Summary:
This patch vectorizes loads of consecutive memory accesses that are used in a non-consecutive (jumbled) order. An earlier attempt, D26905, was reverted due to an issue with representing the 'use mask' of the jumbled accesses. This patch fixes the mask representation by recording the 'use mask' in the user tree entry.

Change-Id: I9fe7f5045f065d84c126fa307ef6ebe0787296df

Reviewers: mkuper, loladiro, Ayal, zvi, danielcdh

Reviewed By: Ayal

Subscribers: mzolotukhin

Differential Revision: https://reviews.llvm.org/D36130

Commit after rebase for patch D36130
Change-Id: I8add1c265455669ef288d880f870a9522c8c08ab

llvm-svn: 313736
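As a rough illustration of the transform this enables (a minimal sketch, not taken from the patch; the function name and values are hypothetical): four scalar loads from consecutive addresses whose users consume them out of order can be replaced by a single vector load, with the recorded use mask applied as a shufflevector so the lanes reach the users in the expected order.

; Hypothetical sketch only: one consecutive <4 x i32> load replaces four
; scalar loads of A[0..3]; the use mask <0,1,3,2> reorders the lanes for
; users that consume A[0], A[1], A[3], A[2] in that (jumbled) order.
define <4 x i32> @sketch(i32* %A) {
entry:
  %p = bitcast i32* %A to <4 x i32>*
  %v = load <4 x i32>, <4 x i32>* %p, align 4
  %shuf = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
  ret <4 x i32> %shuf
}

The CHECK lines in the test below show the same pattern produced by the SLP vectorizer for the jumbled uses of A[0..3].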
Diffstat (limited to 'llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll')
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll  68
1 file changed, 68 insertions(+), 0 deletions(-)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll
new file mode 100644
index 00000000000..cddb6d7076d
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
+
+
+;void jumble (int * restrict A, int * restrict B) {
+ ; int tmp0 = A[10]*A[0];
+ ; int tmp1 = A[11]*A[1];
+ ; int tmp2 = A[12]*A[3];
+ ; int tmp3 = A[13]*A[2];
+ ; B[0] = tmp0;
+ ; B[1] = tmp1;
+ ; B[2] = tmp2;
+ ; B[3] = tmp3;
+ ;}
+ ; Function Attrs: norecurse nounwind uwtable
+ define void @jumble(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
+; CHECK-LABEL: @jumble(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 10
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 11
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 13
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP1]]
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 10
+ %0 = load i32, i32* %arrayidx, align 4
+ %1 = load i32, i32* %A, align 4
+ %mul = mul nsw i32 %1, %0
+ %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 11
+ %2 = load i32, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 1
+ %3 = load i32, i32* %arrayidx3, align 4
+ %mul4 = mul nsw i32 %3, %2
+ %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 12
+ %4 = load i32, i32* %arrayidx5, align 4
+ %arrayidx6 = getelementptr inbounds i32, i32* %A, i64 3
+ %5 = load i32, i32* %arrayidx6, align 4
+ %mul7 = mul nsw i32 %5, %4
+ %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 13
+ %6 = load i32, i32* %arrayidx8, align 4
+ %arrayidx9 = getelementptr inbounds i32, i32* %A, i64 2
+ %7 = load i32, i32* %arrayidx9, align 4
+ %mul10 = mul nsw i32 %7, %6
+ store i32 %mul, i32* %B, align 4
+ %arrayidx12 = getelementptr inbounds i32, i32* %B, i64 1
+ store i32 %mul4, i32* %arrayidx12, align 4
+ %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 2
+ store i32 %mul7, i32* %arrayidx13, align 4
+ %arrayidx14 = getelementptr inbounds i32, i32* %B, i64 3
+ store i32 %mul10, i32* %arrayidx14, align 4
+ ret void
+ }
+