From 34da6dd696439e195e7b650d97a95913101a88d9 Mon Sep 17 00:00:00 2001
From: Dorit Nuzman
Date: Wed, 31 Oct 2018 09:57:56 +0000
Subject: [LV] Support vectorization of interleave-groups that require an
 epilog under optsize using masked wide loads

Under Opt for Size, the vectorizer does not vectorize interleave-groups that
have gaps at the end of the group (such as a loop that reads only the even
elements: a[2*i]), because that implies we would need a scalar epilogue,
which is not allowed under Opt for Size.

This patch extends the support for masked-interleave-groups (introduced by
D53011 for conditional accesses) to also cover the case of gaps in a group of
loads; targets that enable the masked-interleave-group feature don't have to
invalidate interleave-groups of loads with gaps: they can now use masked
wide-loads and shuffles (if that's what the cost model selects).

Reviewers: Ayal, hsaito, dcaballe, fhahn

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D53668

llvm-svn: 345705
---
 .../X86/x86-interleaved-accesses-masked-group.ll | 305 ++++++++++++++++-----
 1 file changed, 235 insertions(+), 70 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
index a2304e447f5..0b23d6286f9 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -84,26 +84,39 @@ for.end:
 ; Exactly the same scenario except we are now optimizing for size, therefore
 ; we check that no scalar epilogue is created. Since we can't create an epilog
-; the interleave-group is invalidated because is has gaps, so we end up
-; scalarizing.
-; (Before the fix that this test checks, we used to create an epilogue despite
-; optsize, and vectorized the access as an interleaved-group. This is now fixed,
-; and we make sure that a scalar epilogue does not exist).
+; we need the ability to mask out the gaps.
+; When enable-masked-interleaved-access is enabled, the interleave-groups will
+; be vectorized with masked wide-loads, with the mask properly shuffled and
+; And-ed with the gaps mask.
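
As a conceptual aside (not part of the patch or the test), the following scalar
C model sketches what the masked interleaved access checked below computes in
one vector iteration (VF = 8, stride 2, gap at the odd member); all names are
illustrative:

    /* Scalar model of one vector iteration at base pointer p. */
    #include <string.h>

    void model_masked_strided_load(const unsigned char *p, unsigned char *q,
                                   const int lane_mask[8] /* i > guard lanes */) {
      /* "Shuffle" each of the 8 lane bits so it covers both members of its group. */
      int interleaved_mask[16];
      for (int i = 0; i < 8; ++i)
        interleaved_mask[2 * i] = interleaved_mask[2 * i + 1] = lane_mask[i];

      /* And with the compile-time gaps mask: the odd member is never accessed. */
      static const int gaps_mask[16] = {1, 0, 1, 0, 1, 0, 1, 0,
                                        1, 0, 1, 0, 1, 0, 1, 0};
      unsigned char wide[16];
      memset(wide, 0, sizeof wide);
      for (int j = 0; j < 16; ++j)
        if (interleaved_mask[j] & gaps_mask[j])
          wide[j] = p[j];                /* the masked wide load */

      /* A strided shuffle extracts member 0; a masked store writes q. */
      for (int i = 0; i < 8; ++i)
        if (lane_mask[i])
          q[i] = wide[2 * i];
    }
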
 ;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize(
-;ENABLED_MASKED_STRIDED: vector.body:
-;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32
-;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [
-;ENABLED_MASKED_STRIDED-NOT: %interleaved.mask =
-;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
-;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
-;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]],
-;ENABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
-;ENABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue
-;ENABLED_MASKED_STRIDED-NOT: %interleaved.mask =
-;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
-;ENABLED_MASKED_STRIDED-NOT: for.body:
-;ENABLED_MASKED_STRIDED: for.end:
+;ENABLED_MASKED_STRIDED-NEXT: entry:
+;ENABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
+;ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
+;ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+;ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+;ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
+;ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
+;ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>*
+;ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP6]], i32 1, <8 x i1> [[TMP0]])
+;ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+;ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+;ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP7]]
+;ENABLED_MASKED_STRIDED-NOT: for.body:
+;ENABLED_MASKED_STRIDED: for.end:
+;ENABLED_MASKED_STRIDED-NEXT: ret void
+
 define dso_local void @masked_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
 entry:
@@ -138,12 +151,15 @@ for.end:
 ; remainder loop into the main loop using masking) together with interleaved-
 ; groups.
 ; When masked-interleave-group is disabled the interleave-groups will be
-; invalidated during Legality checks;
-; When masked-interleave-group is enabled the interleave-groups will be
-; invalidated during cost-model checks, because we don't have a way to support
-; interleave-groups with gaps that require an epilogue using masking.
-; So in both cases we check for no epilogue and scalarized conditional accesses.
-
+; invalidated during Legality checks; so there we check for no epilogue
+; and for scalarized conditional accesses.
+; When masked-interleave-group is enabled we check that there is no epilogue,
+; and that the interleave-groups are vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; The mask itself is an And of two masks: one that masks away the remainder
+; iterations, and one that masks away the 'else' of the 'if' statement.
+; The shuffled mask is also And-ed with the gaps mask.
+;
 ; void masked_strided1_optsize_unknown_tc(const unsigned char* restrict p,
 ;                                         unsigned char* restrict q,
 ;                                         unsigned char guard,
 ;                                         int n) {
@@ -178,21 +194,39 @@ for.end:
 
 ; ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize_unknown_tc(
+; ENABLED_MASKED_STRIDED-NEXT: entry:
+; ENABLED_MASKED_STRIDED-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
+; ENABLED_MASKED_STRIDED: vector.ph:
+; ENABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
+; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
+; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
+; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; ENABLED_MASKED_STRIDED: vector.body:
-; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi
-; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], {{.*}}
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]],
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP0]], [[TMP2]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
-; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; ENABLED_MASKED_STRIDED: pred.load.if:
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP5]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP6]], align 1
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> undef, i8 [[TMP7]], i32 0
-; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]]
+; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
+; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[TMP6]], <16 x i8> undef)
+; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <8 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP8]], i32 1, <8 x i1> [[TMP4]])
+; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP9]], label [[FOR_END]], label [[VECTOR_BODY]]
 ; ENABLED_MASKED_STRIDED-NOT: for.body:
 ; ENABLED_MASKED_STRIDED: for.end:
 ; ENABLED_MASKED_STRIDED-NEXT: ret void
@@ -231,17 +265,115 @@ for.end:
   ret void
 }
 
+; Same, with stride 3, to check the gaps mask and the shuffled mask with a
+; different stride.
+; So we again have accesses with gaps under the optsize scenario, with an
+; unknown trip count, in order to check the behavior of folding-the-tail
+; (folding the remainder loop into the main loop using masking) together with
+; interleaved-groups.
+; When masked-interleave-group is enabled we check that there is no epilogue,
+; and that the interleave-groups are vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; The mask itself is an And of two masks: one that masks away the remainder
+; iterations, and one that masks away the 'else' of the 'if' statement.
+; The shuffled mask is also And-ed with the gaps mask.
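
To make the mask composition concrete (this sketch is illustrative commentary,
not part of the test), the final 24-lane mask for the stride-3 group below can
be modeled in C as follows; all names are made up:

    /* Composing the wide mask for VF = 8, stride 3, only member 0 accessed. */
    void build_stride3_mask(const int iv[8], int guard, int trip_count_minus_1,
                            int wide_mask[24]) {
      static const int gaps_mask[3] = {1, 0, 0};  /* members 1 and 2 are gaps */
      for (int i = 0; i < 8; ++i) {
        /* And of two masks: the 'if' condition and the remainder (tail) mask. */
        int lane = (iv[i] > guard) && (iv[i] <= trip_count_minus_1);
        /* Replicate ("shuffle") the lane bit across its group, And with gaps. */
        for (int m = 0; m < 3; ++m)
          wide_mask[3 * i + m] = lane & gaps_mask[m];
      }
    }
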
+;
+; void masked_strided3_optsize_unknown_tc(const unsigned char* restrict p,
+;                                         unsigned char* restrict q,
+;                                         unsigned char guard,
+;                                         int n) {
+;   for(ix=0; ix < n; ++ix) {
+;     if (ix > guard) {
+;       char t = p[3*ix];
+;       q[ix] = t;
+;     }
+;   }
+; }
+
+
+; ENABLED_MASKED_STRIDED-LABEL: @masked_strided3_optsize_unknown_tc(
+; ENABLED_MASKED_STRIDED-NEXT: entry:
+; ENABLED_MASKED_STRIDED-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
+; ENABLED_MASKED_STRIDED: vector.ph:
+; ENABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
+; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
+; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
+; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
+; ENABLED_MASKED_STRIDED: vector.body:
+; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = mul nsw i32 [[INDEX]], 3
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <24 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = and <24 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false>
+; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <24 x i8> @llvm.masked.load.v24i8.p0v24i8(<24 x i8>* [[TMP5]], i32 1, <24 x i1> [[TMP6]], <24 x i8> undef)
+; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i8> [[WIDE_MASKED_VEC]], <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <8 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP8]], i32 1, <8 x i1> [[TMP4]])
+; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP9]], label [[FOR_END]], label [[VECTOR_BODY]]
+; ENABLED_MASKED_STRIDED: for.end:
+; ENABLED_MASKED_STRIDED-NEXT: ret void
+;
+define dso_local void @masked_strided3_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard, i32 %n) local_unnamed_addr optsize {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.010, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = mul nsw i32 %ix.010, 3
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.010
+  store i8 %0, i8* %arrayidx3, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.010, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
 
-; Same, but the load/store are not predicated. The interleave-group is
-; invalidated here as well because we have gaps and we can't create an epilog.
-; The access is thus scalarized.
+; Back to stride 2 with gaps and a known trip count under opt for size,
+; but this time the load/store are not predicated.
+; When enable-masked-interleaved-access is disabled, the interleave-groups will
+; be invalidated during cost-model checks because we have gaps and we can't
+; create an epilog. The access is thus scalarized.
 ; (Before the fix that this test checks, we used to create an epilogue despite
 ; optsize, and vectorized the access as an interleaved-group. This is now fixed,
 ; and we make sure that a scalar epilogue does not exist).
-; Since enable-masked-interleaved-accesses currently only affects predicated
-; accesses, the behavior is the same with this switch set/unset.
-
-
+; When enable-masked-interleaved-access is enabled, the interleave-groups will
+; be vectorized with masked wide-loads (masking away the gaps).
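
A brief illustrative aside (not part of the test): with no condition and a trip
count that is a multiple of VF, no runtime mask is needed at all; the wide
load's mask degenerates to the constant gaps mask <1,0,1,0,...>, so each vector
iteration simply reads the even bytes. A scalar C model of the whole loop:

    /* Scalar model: stride-2 load with a gap, unconditional, trip count 1024. */
    void model_unconditional_known_tc(const unsigned char *p, unsigned char *q) {
      for (int i = 0; i < 1024; ++i)
        q[i] = p[2 * i];   /* odd bytes are masked off by the constant gaps mask */
    }
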
+;
 ; void unconditional_strided1_optsize(const unsigned char* restrict p,
 ;                                     unsigned char* restrict q,
 ;                                     unsigned char guard) {
@@ -259,11 +391,25 @@ for.end:
 ;DISABLED_MASKED_STRIDED: for.end:
 
 ;ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
-;ENABLED_MASKED_STRIDED: vector.body:
-;ENABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
-;ENABLED_MASKED_STRIDED: %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0
-;ENABLED_MASKED_STRIDED-NOT: for.body:
-;ENABLED_MASKED_STRIDED: for.end:
+;ENABLED_MASKED_STRIDED-NEXT: entry:
+;ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
+;ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP2]], i32 1, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <8 x i8>*
+;ENABLED_MASKED_STRIDED-NEXT: store <8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP4]], align 1
+;ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+;ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]]
+;ENABLED_MASKED_STRIDED-NOT: for.body:
+;ENABLED_MASKED_STRIDED: for.end:
+;ENABLED_MASKED_STRIDED-NEXT: ret void
+
 define dso_local void @unconditional_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
 entry:
@@ -289,13 +435,17 @@ for.end:
 ; Unconditional accesses with gaps under Optsize scenario again, with unknown
 ; trip-count this time, in order to check the behavior of folding-the-tail
 ; (folding the remainder loop into the main loop using masking) together with
-; interleaved-groups.
-; The interleave-groups will be invalidated during cost-model checks, because
-; we don't have a way to support interleave-groups with gaps that require an
-; epilogue using masking (even when interleaved-masking is enabled; this
-; is not yet supported).
-; So we check for no epilogue and for scalarized conditional accesses.
-
+; interleaved-groups. Folding-the-tail turns the accesses into conditional
+; accesses, which requires proper masking. In addition we need to mask out
+; the gaps (all because we are not allowed to use an epilog due to optsize).
+; When enable-masked-interleaved-access is disabled, the interleave-groups will
+; be invalidated during cost-model checks. So there we check for no epilogue
+; and for scalarized conditional accesses.
+; When masked-interleave-group is enabled we check that there is no epilogue,
+; and that the interleave-groups are vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; The shuffled mask is also And-ed with the gaps mask.
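
Again as an illustrative aside with made-up names: when the tail is folded and
there is no 'if', the only runtime mask is the remainder check, which is then
replicated and And-ed with the gaps mask exactly as in the conditional cases:

    /* Scalar model of the tail-folded unconditional stride-2 loop. */
    void model_tail_folded_unconditional(const unsigned char *p,
                                         unsigned char *q, int n) {
      for (int index = 0; index < n; index += 8)     /* vector iterations */
        for (int i = 0; i < 8; ++i)
          if (index + i <= n - 1)                    /* remainder-mask lane */
            q[index + i] = p[2 * (index + i)];       /* even member only (gaps) */
    }
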
+;
 ; for(ix=0; ix < n; ++ix) {
 ;   char t = p[2*ix];
 ;   q[ix] = t;
 ; }
@@ -319,21 +469,36 @@ for.end:
 ; DISABLED_MASKED_STRIDED: for.end:
 ; DISABLED_MASKED_STRIDED-NEXT: ret void
-
 ; ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize_unknown_tc(
+; ENABLED_MASKED_STRIDED-NEXT: entry:
+; ENABLED_MASKED_STRIDED-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
+; ENABLED_MASKED_STRIDED: vector.ph:
+; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
+; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
+; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; ENABLED_MASKED_STRIDED: vector.body:
-; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
-; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]],
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
-; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; ENABLED_MASKED_STRIDED: pred.load.if:
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]]
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0
-; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]]
+; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[INDEX]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT: [[INDUCTION:%.*]] = add <8 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> [[INDUCTION]], [[BROADCAST_SPLAT2]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
+; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef)
+; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP6]], i32 1, <8 x i1> [[TMP2]])
+; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP7]], label [[FOR_END]], label [[VECTOR_BODY]]
 ; ENABLED_MASKED_STRIDED-NOT: for.body:
 ; ENABLED_MASKED_STRIDED: for.end:
 ; ENABLED_MASKED_STRIDED-NEXT: ret void
--