[IAI,LV] Avoid creating a scalar epilogue due to gaps in interleave-groups when

optimizing for size LV is careful to respect -Os and not to create a scalar epilog in all cases (runtime tests, trip-counts that require a remainder loop) except for peeling due to gaps in interleave-groups. This patch fixes that; -Os will now have us invalidate such interleave-groups and vectorize without an epilog. The patch also removes a related FIXME comment that is now obsolete, and was also inaccurate: "FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a smaller MaxVF that does not require a scalar epilog." (requiresScalarEpilog() has nothing to do with VF). Reviewers: Ayal, hsaito, dcaballe, fhahn Reviewed By: Ayal Differential Revision: https://reviews.llvm.org/D53420 llvm-svn: 344883
author: Dorit Nuzman <dorit.nuzman@intel.com> 2018-10-22 06:17:09 +0000
committer: Dorit Nuzman <dorit.nuzman@intel.com> 2018-10-22 06:17:09 +0000
commit: 3ec99fe21bbc44d0a3ff898644f71aa2e1e8d6ef (patch)
tree: 611aadca17d90c373ca52fd74c2c9300313ac141 /llvm/test/Transforms/LoopVectorize/X86
parent: 2336dc3c51c8883a1ef171a4236c448b54f6993c (diff)
download: bcm5719-llvm-3ec99fe21bbc44d0a3ff898644f71aa2e1e8d6ef.tar.gz
bcm5719-llvm-3ec99fe21bbc44d0a3ff898644f71aa2e1e8d6ef.zip
1 files changed, 112 insertions, 2 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
index b1163d0a199..61a2e2ca003 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -1,5 +1,5 @@
-; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED 
-; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses  -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED 
+; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED 
+; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses  -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED 
 
 target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
 target triple = "i386-unknown-linux-gnu"
@@ -9,9 +9,13 @@ target triple = "i386-unknown-linux-gnu"
 ; interleaved-group but rather as a scalarized accesses.
 ; (For SKX, Gather is not supported by the compiler for chars, therefore
 ;  the only remaining alternative is to scalarize).
+; In this case a scalar epilogue is not needed.
+;
 ; When  masked-interleave-group is enabled we expect to find the proper mask
 ; shuffling code, feeding the wide masked load for an interleave-group (with
 ; a single member).
+; Since the last (second) member of the load-group is a gap, peeling is used,
+; so we also expect to find a scalar epilogue loop.
 ;
 ; void masked_strided1(const unsigned char* restrict p,
 ;                      unsigned char* restrict q,
@@ -38,6 +42,8 @@ target triple = "i386-unknown-linux-gnu"
 ;DISABLED_MASKED_STRIDED-NOT:   %interleaved.mask =
 ;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.load.
 ;DISABLED_MASKED_STRIDED-NOT:   %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;DISABLED_MASKED_STRIDED-NOT: for.body:
+;DISABLED_MASKED_STRIDED:     for.end:
 
 ;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1(
 ;ENABLED_MASKED_STRIDED: vector.body:
@@ -47,6 +53,7 @@ target triple = "i386-unknown-linux-gnu"
 ;ENABLED_MASKED_STRIDED:       %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
 ;ENABLED_MASKED_STRIDED-NEXT:  %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
 ;ENABLED_MASKED_STRIDED-NEXT:  %[[STRIDEDVEC:.+]] = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;ENABLED_MASKED_STRIDED: for.body:
 
 define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
 entry:
@@ -75,6 +82,109 @@ for.end:
   ret void
 }
 
+; Exactly the same scenario except we are now optimizing for size, therefore
+; we check that no scalar epilogue is created. Since we can't create an epilog
+; the interleave-group is invalidated because is has gaps, so we end up
+; scalarizing.
+; (Before the fix that this test checks, we used to create an epilogue despite
+; optsize, and vectorized the access as an interleaved-group. This is now fixed,
+; and we make sure that a scalar epilogue does not exist).
+
+;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize(
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NEXT:  %index = phi i32 
+;ENABLED_MASKED_STRIDED-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;ENABLED_MASKED_STRIDED-NOT:   %interleaved.mask = 
+;ENABLED_MASKED_STRIDED-NOT:   call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
+;ENABLED_MASKED_STRIDED:       %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;ENABLED_MASKED_STRIDED-NEXT:  %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+;ENABLED_MASKED_STRIDED-NEXT:  %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
+;ENABLED_MASKED_STRIDED-NEXT:  br i1 %[[M]], label %pred.load.if, label %pred.load.continue
+;ENABLED_MASKED_STRIDED-NOT:   %interleaved.mask = 
+;ENABLED_MASKED_STRIDED-NOT:   call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
+;ENABLED_MASKED_STRIDED-NOT: for.body:
+;ENABLED_MASKED_STRIDED:     for.end:
+
+define dso_local void @masked_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.09, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.09, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09
+  store i8 %0, i8* %arrayidx3, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.09, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; Same, but the load/store are not predicated. The interleave-group is
+; invalidated here as well because we have gaps and we can't create an epilog.
+; The access is thus scalarized.
+; (Before the fix that this test checks, we used to create an epilogue despite
+; optsize, and vectorized the access as an interleaved-group. This is now fixed,
+; and we make sure that a scalar epilogue does not exist).
+; Since enable-masked-interleaved-accesses currently only affects predicated
+; accesses, the behavior is the same with this switch set/unset.
+
+
+; void unconditional_strided1_optsize(const unsigned char* restrict p,
+;                                unsigned char* restrict q,
+;                                unsigned char guard) {
+;   for(ix=0; ix < 1024; ++ix) {
+;         char t = p[2*ix];
+;         q[ix] = t;
+;   }
+; }
+
+;DISABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
+;DISABLED_MASKED_STRIDED: vector.body:
+;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
+;DISABLED_MASKED_STRIDED:     %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0       
+;DISABLED_MASKED_STRIDED-NOT: for.body:
+;DISABLED_MASKED_STRIDED:     for.end:
+
+;ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
+;ENABLED_MASKED_STRIDED:     %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0       
+;ENABLED_MASKED_STRIDED-NOT: for.body:
+;ENABLED_MASKED_STRIDED:     for.end:
+
+define dso_local void @unconditional_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
+entry:
+  br label %for.body
+
+for.body:
+  %ix.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %mul = shl nuw nsw i32 %ix.06, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, i8* %q, i32 %ix.06
+  store i8 %0, i8* %arrayidx1, align 1
+  %inc = add nuw nsw i32 %ix.06, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+
 ; Check also a scenario with full interleave-groups (no gaps) as well as both
 ; load and store groups. We check that when masked-interleave-group is disabled
 ; the predicated loads (and stores) are not vectorized as an
author	Dorit Nuzman <dorit.nuzman@intel.com>	2018-10-22 06:17:09 +0000
committer	Dorit Nuzman <dorit.nuzman@intel.com>	2018-10-22 06:17:09 +0000
commit	3ec99fe21bbc44d0a3ff898644f71aa2e1e8d6ef (patch)
tree	611aadca17d90c373ca52fd74c2c9300313ac141 /llvm/test/Transforms/LoopVectorize/X86
parent	2336dc3c51c8883a1ef171a4236c448b54f6993c (diff)
download	bcm5719-llvm-3ec99fe21bbc44d0a3ff898644f71aa2e1e8d6ef.tar.gz bcm5719-llvm-3ec99fe21bbc44d0a3ff898644f71aa2e1e8d6ef.zip