[IAI,LV] Add support for vectorizing predicated strided accesses using masked

interleave-group The vectorizer currently does not attempt to create interleave-groups that contain predicated loads/stores; predicated strided accesses can currently be vectorized only using masked gather/scatter or scalarization. This patch makes predicated loads/stores candidates for forming interleave-groups during the Loop-Vectorizer's analysis, and adds the proper support for masked-interleave- groups to the Loop-Vectorizer's planning and transformation stages. The patch also extends the TTI API to allow querying the cost of masked interleave groups (which each target can control); Targets that support masked vector loads/ stores may choose to enable this feature and allow vectorizing predicated strided loads/stores using masked wide loads/stores and shuffles. Reviewers: Ayal, hsaito, dcaballe, fhahn, javed.absar Reviewed By: Ayal Differential Revision: https://reviews.llvm.org/D53011 llvm-svn: 344472
author: Dorit Nuzman <dorit.nuzman@intel.com> 2018-10-14 07:06:16 +0000
committer: Dorit Nuzman <dorit.nuzman@intel.com> 2018-10-14 07:06:16 +0000
commit: 8174368955177c0765977996b00a0184921d5420 (patch)
tree: 68ae87abe77ac7844dd6dae68522339840ebf075 /llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
parent: 20fa085d74336f1f5801aa53d47039adbf116a82 (diff)
download: bcm5719-llvm-8174368955177c0765977996b00a0184921d5420.tar.gz
bcm5719-llvm-8174368955177c0765977996b00a0184921d5420.zip
1 files changed, 222 insertions, 0 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
new file mode 100644
index 00000000000..9ed66a22dbf
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
@@ -0,0 +1,222 @@
+; REQUIRES: asserts
+; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_UNMASKED
+; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_MASKED
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+
+; We test here that the loop-vectorizer forms an interleave-groups from 
+; predicated memory accesses only if they are both in the same (predicated)
+; block (first scenario below).
+; If the accesses are not in the same predicated block, an interleave-group
+; is not formed (scenarios 2,3 below).
+
+; Scenario 1: Check the case where it is legal to create masked interleave-
+; groups. Altogether two groups are created (one for loads and one for stores)
+; when masked-interleaved-acceses are enabled. When masked-interleaved-acceses
+; are disabled we do not create any interleave-group.
+;
+; void masked_strided1(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     unsigned char guard) {
+; for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard) {
+;         char left = p[2*ix];
+;         char right = p[2*ix + 1];
+;         char max = max(left, right);
+;         q[2*ix] = max;
+;         q[2*ix+1] = 0 - max;
+;     }
+; }
+;}
+
+
+; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided1" 
+; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group 
+
+; STRIDED_MASKED: LV: Checking a loop in "masked_strided1" 
+; STRIDED_MASKED: LV: Analyzing interleaved accesses...
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 %{{.*}}, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Inserted:  store i8  %{{.*}}, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT:     into the interleave group with  store i8 %{{.*}}, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:   %{{.*}} = load i8, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Inserted:  %{{.*}} = load i8, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT:     into the interleave group with   %{{.*}} = load i8, i8* %{{.*}}, align 1
+
+; Scenario 2: Check the case where it is illegal to create a masked interleave-
+; group because the first access is predicated, and the second isn't.
+; We therefore create a separate interleave-group with gaps for each of the
+; stores (if masked-interleaved-accesses are enabled) and these are later
+; invalidated because interleave-groups of stores with gaps are not supported. 
+; If masked-interleaved-accesses is not enabled we create only one interleave
+; group of stores (for the non-predicated store) and it is later invalidated
+; due to gaps.
+;
+; void masked_strided2(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     unsigned char guard1,
+;                     unsigned char guard2) {
+; for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard1) {
+;         q[2*ix] = 1;
+;     }
+;     q[2*ix+1] = 2;
+; }
+;}
+
+; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided2" 
+; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NEXT: LV: Creating an interleave group with:  store i8 1, i8* %{{.*}}, align 1
+; STRIDED_UNMASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group 
+
+; STRIDED_MASKED: LV: Checking a loop in "masked_strided2" 
+; STRIDED_MASKED: LV: Analyzing interleaved accesses...
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 2, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 1, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+
+
+; Scenario 3: Check the case where it is illegal to create a masked interleave-
+; group because the two accesses are in separate predicated blocks.
+; We therefore create a separate interleave-group with gaps for each of the accesses,
+; (which are later invalidated because interleave-groups of stores with gaps are 
+; not supported).
+; If masked-interleaved-accesses is not enabled we don't create any interleave
+; group because all accesses are predicated.
+;
+; void masked_strided3(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     unsigned char guard1,
+;                     unsigned char guard2) {
+; for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard1) {
+;         q[2*ix] = 1;
+;     }
+;     if (ix > guard2) {
+;         q[2*ix+1] = 2;
+;     }
+; }
+;}
+
+
+; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided3" 
+; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group 
+
+; STRIDED_MASKED: LV: Checking a loop in "masked_strided3" 
+; STRIDED_MASKED: LV: Analyzing interleaved accesses...
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 2, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 1, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+
+
+; ModuleID = 'test.c'
+source_filename = "test.c"
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.024, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.024, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %add = or i32 %mul, 1
+  %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add
+  %1 = load i8, i8* %arrayidx4, align 1
+  %cmp.i = icmp slt i8 %0, %1
+  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+  %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 %spec.select.i, i8* %arrayidx6, align 1
+  %sub = sub i8 0, %spec.select.i
+  %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 %sub, i8* %arrayidx11, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.024, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+
+define dso_local void @masked_strided2(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.012 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %mul = shl nuw nsw i32 %ix.012, 1
+  %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 1, i8* %arrayidx, align 1
+  %cmp1 = icmp ugt i32 %ix.012, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %add = or i32 %mul, 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 2, i8* %arrayidx3, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.012, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+
+define dso_local void @masked_strided3(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard1, i8 zeroext %guard2) local_unnamed_addr #0 {
+entry:
+  %conv = zext i8 %guard1 to i32
+  %conv3 = zext i8 %guard2 to i32
+  br label %for.body
+
+for.body:
+  %ix.018 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %mul = shl nuw nsw i32 %ix.018, 1
+  %cmp1 = icmp ugt i32 %ix.018, %conv
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:
+  %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 1, i8* %arrayidx, align 1
+  br label %if.end
+
+if.end:
+  %cmp4 = icmp ugt i32 %ix.018, %conv3
+  br i1 %cmp4, label %if.then6, label %for.inc
+
+if.then6:
+  %add = or i32 %mul, 1
+  %arrayidx7 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 2, i8* %arrayidx7, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.018, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+attributes #0 = {  "target-features"="+fxsr,+mmx,+sse,+sse2,+x87"  }
author	Dorit Nuzman <dorit.nuzman@intel.com>	2018-10-14 07:06:16 +0000
committer	Dorit Nuzman <dorit.nuzman@intel.com>	2018-10-14 07:06:16 +0000
commit	8174368955177c0765977996b00a0184921d5420 (patch)
tree	68ae87abe77ac7844dd6dae68522339840ebf075 /llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
parent	20fa085d74336f1f5801aa53d47039adbf116a82 (diff)
download	bcm5719-llvm-8174368955177c0765977996b00a0184921d5420.tar.gz bcm5719-llvm-8174368955177c0765977996b00a0184921d5420.zip