author    Dorit Nuzman <dorit.nuzman@intel.com>  2018-10-14 07:06:16 +0000
committer Dorit Nuzman <dorit.nuzman@intel.com>  2018-10-14 07:06:16 +0000
commit    8174368955177c0765977996b00a0184921d5420 (patch)
tree      68ae87abe77ac7844dd6dae68522339840ebf075 /llvm/test
parent    20fa085d74336f1f5801aa53d47039adbf116a82 (diff)
[IAI,LV] Add support for vectorizing predicated strided accesses using
masked interleave-groups

The vectorizer currently does not attempt to create interleave-groups
that contain predicated loads/stores; predicated strided accesses can
currently be vectorized only using masked gather/scatter or
scalarization.

This patch makes predicated loads/stores candidates for forming
interleave-groups during the Loop Vectorizer's analysis, and adds the
proper support for masked interleave-groups to the Loop Vectorizer's
planning and transformation stages.

The patch also extends the TTI API to allow querying the cost of masked
interleave-groups (which each target can control); targets that support
masked vector loads/stores may choose to enable this feature and allow
vectorizing predicated strided loads/stores using masked wide
loads/stores and shuffles.

Reviewers: Ayal, hsaito, dcaballe, fhahn, javed.absar

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D53011

llvm-svn: 344472
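For orientation, a minimal sketch of the codegen pattern the new tests check for, written in LLVM IR of the same typed-pointer vintage as the tests below; the value names (%mask, %ptr, etc.) are illustrative, not taken from the patch. An 8-lane block predicate guarding a stride-2 group of i8 accesses is replicated into a 16-lane mask, which feeds a single masked wide load; a de-interleaving shuffle then extracts a group member:

  ; Replicate each lane of the 8-wide block mask so both members of each
  ; stride-2 slot inherit their lane's predicate.
  %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
  ; A single masked wide load reads the whole interleave-group ...
  %wide.masked.vec = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %ptr, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
  ; ... and a strided shuffle extracts the even-indexed member.
  %strided.vec = shufflevector <16 x i8> %wide.masked.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>

Without this support, a predicated strided access has to fall back to a masked gather/scatter or to scalarized accesses behind branches, which is exactly the contrast the DISABLED_MASKED_STRIDED and ENABLED_MASKED_STRIDED prefixes below exercise.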
Diffstat (limited to 'llvm/test')
-rw-r--r-- llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll | 164
-rw-r--r-- llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll         | 222
-rw-r--r-- llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll          |   1
3 files changed, 387 insertions, 0 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
new file mode 100644
index 00000000000..b1163d0a199
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -0,0 +1,164 @@
+; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED
+; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+; When masked-interleaved-groups are disabled:
+; Check that the predicated load is not vectorized as an
+; interleaved-group but rather as scalarized accesses.
+; (For SKX, gather is not supported by the compiler for chars, therefore
+; the only remaining alternative is to scalarize).
+; When masked-interleave-groups are enabled we expect to find the proper mask
+; shuffling code, feeding the wide masked load for an interleave-group (with
+; a single member).
+;
+; void masked_strided1(const unsigned char* restrict p,
+; unsigned char* restrict q,
+; unsigned char guard) {
+; for(int ix=0; ix < 1024; ++ix) {
+; if (ix > guard) {
+; char t = p[2*ix];
+; q[ix] = t;
+; }
+; }
+; }
+
+;DISABLED_MASKED_STRIDED-LABEL: @masked_strided1(
+;DISABLED_MASKED_STRIDED: vector.body:
+;DISABLED_MASKED_STRIDED-NEXT: %index = phi i32
+;DISABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;DISABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;DISABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+;DISABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
+;DISABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue
+;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+
+;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1(
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32
+;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT: %[[STRIDEDVEC:.+]] = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+
+define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
+entry:
+ %conv = zext i8 %guard to i32
+ br label %for.body
+
+for.body:
+ %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp1 = icmp ugt i32 %ix.09, %conv
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %mul = shl nuw nsw i32 %ix.09, 1
+ %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+ %0 = load i8, i8* %arrayidx, align 1
+ %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09
+ store i8 %0, i8* %arrayidx3, align 1
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i32 %ix.09, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; Check also a scenario with full interleave-groups (no gaps) as well as both
+; load and store groups. We check that when masked-interleave-groups are disabled
+; the predicated loads (and stores) are not vectorized as an
+; interleaved-group but rather as four separate scalarized accesses.
+; (For SKX, gather/scatter is not supported by the compiler for chars, therefore
+; the only remaining alternative is to scalarize).
+; When masked-interleave-groups are enabled we expect to find the proper mask
+; shuffling code, feeding the wide masked load/store for the two
+; interleave-groups.
+;
+; void masked_strided2(const unsigned char* restrict p,
+; unsigned char* restrict q,
+; unsigned char guard) {
+; for(int ix=0; ix < 1024; ++ix) {
+; if (ix > guard) {
+; char left = p[2*ix];
+; char right = p[2*ix + 1];
+; char max = max(left, right);
+; q[2*ix] = max;
+; q[2*ix+1] = 0 - max;
+; }
+; }
+;}
+
+;DISABLED_MASKED_STRIDED-LABEL: @masked_strided2(
+;DISABLED_MASKED_STRIDED: vector.body:
+;DISABLED_MASKED_STRIDED-NEXT: %index = phi i32
+;DISABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.store.
+;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;DISABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;DISABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+;DISABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
+;DISABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue
+;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.store.
+;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+
+;ENABLED_MASKED_STRIDED-LABEL: @masked_strided2(
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32
+;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+;ENABLED_MASKED_STRIDED: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask)
+
+; Function Attrs: norecurse nounwind
+define dso_local void @masked_strided2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
+entry:
+ %conv = zext i8 %guard to i32
+ br label %for.body
+
+for.body:
+ %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp1 = icmp ugt i32 %ix.024, %conv
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %mul = shl nuw nsw i32 %ix.024, 1
+ %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+ %0 = load i8, i8* %arrayidx, align 1
+ %add = or i32 %mul, 1
+ %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add
+ %1 = load i8, i8* %arrayidx4, align 1
+ %cmp.i = icmp slt i8 %0, %1
+ %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+ %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul
+ store i8 %spec.select.i, i8* %arrayidx6, align 1
+ %sub = sub i8 0, %spec.select.i
+ %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add
+ store i8 %sub, i8* %arrayidx11, align 1
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i32 %ix.024, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
new file mode 100644
index 00000000000..9ed66a22dbf
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
@@ -0,0 +1,222 @@
+; REQUIRES: asserts
+; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_UNMASKED
+; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_MASKED
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+
+; We test here that the loop-vectorizer forms an interleave-group from
+; predicated memory accesses only if they are both in the same (predicated)
+; block (first scenario below).
+; If the accesses are not in the same predicated block, an interleave-group
+; is not formed (scenarios 2,3 below).
+
+; Scenario 1: Check the case where it is legal to create masked interleave-
+; groups. Altogether two groups are created (one for loads and one for stores)
+; when masked-interleaved-accesses are enabled. When masked-interleaved-accesses
+; are disabled we do not create any interleave-group.
+;
+; void masked_strided1(const unsigned char* restrict p,
+; unsigned char* restrict q,
+; unsigned char guard) {
+; for(int ix=0; ix < 1024; ++ix) {
+; if (ix > guard) {
+; char left = p[2*ix];
+; char right = p[2*ix + 1];
+; char max = max(left, right);
+; q[2*ix] = max;
+; q[2*ix+1] = 0 - max;
+; }
+; }
+;}
+
+
+; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided1"
+; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group
+
+; STRIDED_MASKED: LV: Checking a loop in "masked_strided1"
+; STRIDED_MASKED: LV: Analyzing interleaved accesses...
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 %{{.*}}, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Inserted: store i8 %{{.*}}, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: into the interleave group with store i8 %{{.*}}, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: %{{.*}} = load i8, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Inserted: %{{.*}} = load i8, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: into the interleave group with %{{.*}} = load i8, i8* %{{.*}}, align 1
+
+; Scenario 2: Check the case where it is illegal to create a masked interleave-
+; group because one access is predicated and the other is not (here the store
+; to q[2*ix] is unconditional, while the store to q[2*ix+1] is predicated).
+; We therefore create a separate interleave-group with gaps for each of the
+; stores (if masked-interleaved-accesses are enabled) and these are later
+; invalidated because interleave-groups of stores with gaps are not supported.
+; If masked-interleaved-accesses are not enabled we create only one
+; interleave-group of stores (for the non-predicated store), and it is later
+; invalidated due to gaps.
+;
+; void masked_strided2(const unsigned char* restrict p,
+; unsigned char* restrict q,
+; unsigned char guard1,
+; unsigned char guard2) {
+; for(int ix=0; ix < 1024; ++ix) {
+; q[2*ix] = 1;
+; if (ix > guard1) {
+; q[2*ix+1] = 2;
+; }
+; }
+;}
+
+; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided2"
+; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1
+; STRIDED_UNMASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group
+
+; STRIDED_MASKED: LV: Checking a loop in "masked_strided2"
+; STRIDED_MASKED: LV: Analyzing interleaved accesses...
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 2, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+
+
+; Scenario 3: Check the case where it is illegal to create a masked interleave-
+; group because the two accesses are in separate predicated blocks.
+; We therefore create a separate interleave-group with gaps for each of the
+; accesses (which are later invalidated because interleave-groups of stores
+; with gaps are not supported).
+; If masked-interleaved-accesses are not enabled we don't create any
+; interleave-group because all accesses are predicated.
+;
+; void masked_strided3(const unsigned char* restrict p,
+; unsigned char* restrict q,
+; unsigned char guard1,
+; unsigned char guard2) {
+; for(int ix=0; ix < 1024; ++ix) {
+; if (ix > guard1) {
+; q[2*ix] = 1;
+; }
+; if (ix > guard2) {
+; q[2*ix+1] = 2;
+; }
+; }
+;}
+
+
+; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided3"
+; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group
+
+; STRIDED_MASKED: LV: Checking a loop in "masked_strided3"
+; STRIDED_MASKED: LV: Analyzing interleaved accesses...
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 2, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+
+
+; ModuleID = 'test.c'
+source_filename = "test.c"
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 {
+entry:
+ %conv = zext i8 %guard to i32
+ br label %for.body
+
+for.body:
+ %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp1 = icmp ugt i32 %ix.024, %conv
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %mul = shl nuw nsw i32 %ix.024, 1
+ %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+ %0 = load i8, i8* %arrayidx, align 1
+ %add = or i32 %mul, 1
+ %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add
+ %1 = load i8, i8* %arrayidx4, align 1
+ %cmp.i = icmp slt i8 %0, %1
+ %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+ %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul
+ store i8 %spec.select.i, i8* %arrayidx6, align 1
+ %sub = sub i8 0, %spec.select.i
+ %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add
+ store i8 %sub, i8* %arrayidx11, align 1
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i32 %ix.024, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+
+define dso_local void @masked_strided2(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 {
+entry:
+ %conv = zext i8 %guard to i32
+ br label %for.body
+
+for.body:
+ %ix.012 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %mul = shl nuw nsw i32 %ix.012, 1
+ %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul
+ store i8 1, i8* %arrayidx, align 1
+ %cmp1 = icmp ugt i32 %ix.012, %conv
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %add = or i32 %mul, 1
+ %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %add
+ store i8 2, i8* %arrayidx3, align 1
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i32 %ix.012, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+
+define dso_local void @masked_strided3(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard1, i8 zeroext %guard2) local_unnamed_addr #0 {
+entry:
+ %conv = zext i8 %guard1 to i32
+ %conv3 = zext i8 %guard2 to i32
+ br label %for.body
+
+for.body:
+ %ix.018 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %mul = shl nuw nsw i32 %ix.018, 1
+ %cmp1 = icmp ugt i32 %ix.018, %conv
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then:
+ %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul
+ store i8 1, i8* %arrayidx, align 1
+ br label %if.end
+
+if.end:
+ %cmp4 = icmp ugt i32 %ix.018, %conv3
+ br i1 %cmp4, label %if.then6, label %for.inc
+
+if.then6:
+ %add = or i32 %mul, 1
+ %arrayidx7 = getelementptr inbounds i8, i8* %q, i32 %add
+ store i8 2, i8* %arrayidx7, align 1
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i32 %ix.018, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+attributes #0 = { "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" }
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
index 89c0ac10916..c647f586b18 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
@@ -1,4 +1,5 @@
; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
%pair = type { i64, i64 }