3 files changed, 368 insertions, 35 deletions
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
index d74e26ec20a..30acee4b25c 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s --check-prefix=GENERIC
 ; RUN: opt -S -mcpu=kryo -slp-vectorizer -dce -instcombine < %s | FileCheck %s --check-prefix=KRYO
 
@@ -19,15 +20,157 @@ target triple = "aarch64--linux-gnu"
 ; return sum;
 ; }
 
-; GENERIC-LABEL: @gather_reduce_8x16_i32
+define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
+; GENERIC-LABEL: @gather_reduce_8x16_i32(
+; GENERIC-NEXT: entry:
+; GENERIC-NEXT: [[CMP_99:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; GENERIC-NEXT: br i1 [[CMP_99]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; GENERIC: for.body.preheader:
+; GENERIC-NEXT: br label [[FOR_BODY:%.*]]
+; GENERIC: for.cond.cleanup.loopexit:
+; GENERIC-NEXT: br label [[FOR_COND_CLEANUP]]
+; GENERIC: for.cond.cleanup:
+; GENERIC-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; GENERIC-NEXT: ret i32 [[SUM_0_LCSSA]]
+; GENERIC: for.body:
+; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
+; GENERIC-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+; GENERIC-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
+; GENERIC-NEXT: [[TMP3:%.*]] = bitcast i16* [[B:%.*]] to <8 x i16>*
+; GENERIC-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
+; GENERIC-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
+; GENERIC-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
+; GENERIC-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i32 0
+; GENERIC-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; GENERIC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[G:%.*]], i64 [[TMP8]]
+; GENERIC-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+; GENERIC-NEXT: [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
+; GENERIC-NEXT: [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
+; GENERIC-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i32 1
+; GENERIC-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; GENERIC-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP11]]
+; GENERIC-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; GENERIC-NEXT: [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
+; GENERIC-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
+; GENERIC-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i32 2
+; GENERIC-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; GENERIC-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP14]]
+; GENERIC-NEXT: [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX19]], align 2
+; GENERIC-NEXT: [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
+; GENERIC-NEXT: [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
+; GENERIC-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i32 3
+; GENERIC-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; GENERIC-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP17]]
+; GENERIC-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX28]], align 2
+; GENERIC-NEXT: [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
+; GENERIC-NEXT: [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
+; GENERIC-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i32 4
+; GENERIC-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; GENERIC-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP20]]
+; GENERIC-NEXT: [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX37]], align 2
+; GENERIC-NEXT: [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
+; GENERIC-NEXT: [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
+; GENERIC-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i32 5
+; GENERIC-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; GENERIC-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP23]]
+; GENERIC-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX46]], align 2
+; GENERIC-NEXT: [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
+; GENERIC-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
+; GENERIC-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i32 6
+; GENERIC-NEXT: [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
+; GENERIC-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP26]]
+; GENERIC-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
+; GENERIC-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
+; GENERIC-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
+; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
+; GENERIC-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i32 7
+; GENERIC-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; GENERIC-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
+; GENERIC-NEXT: [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2
+; GENERIC-NEXT: [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
+; GENERIC-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
+; GENERIC-NEXT: [[INC]] = add nuw nsw i32 [[I_0103]], 1
+; GENERIC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; GENERIC-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ;
-; GENERIC: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
-; GENERIC: zext <8 x i16> [[L]] to <8 x i32>
-; GENERIC: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
-; GENERIC: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
-; GENERIC: sext i32 [[X]] to i64
+; KRYO-LABEL: @gather_reduce_8x16_i32(
+; KRYO-NEXT: entry:
+; KRYO-NEXT: [[CMP_99:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; KRYO-NEXT: br i1 [[CMP_99]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; KRYO: for.body.preheader:
+; KRYO-NEXT: br label [[FOR_BODY:%.*]]
+; KRYO: for.cond.cleanup.loopexit:
+; KRYO-NEXT: br label [[FOR_COND_CLEANUP]]
+; KRYO: for.cond.cleanup:
+; KRYO-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; KRYO-NEXT: ret i32 [[SUM_0_LCSSA]]
+; KRYO: for.body:
+; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
+; KRYO-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+; KRYO-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
+; KRYO-NEXT: [[TMP3:%.*]] = bitcast i16* [[B:%.*]] to <8 x i16>*
+; KRYO-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
+; KRYO-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
+; KRYO-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
+; KRYO-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i32 0
+; KRYO-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; KRYO-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[G:%.*]], i64 [[TMP8]]
+; KRYO-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+; KRYO-NEXT: [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
+; KRYO-NEXT: [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
+; KRYO-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i32 1
+; KRYO-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; KRYO-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP11]]
+; KRYO-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; KRYO-NEXT: [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
+; KRYO-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
+; KRYO-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i32 2
+; KRYO-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; KRYO-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP14]]
+; KRYO-NEXT: [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX19]], align 2
+; KRYO-NEXT: [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
+; KRYO-NEXT: [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
+; KRYO-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i32 3
+; KRYO-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; KRYO-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP17]]
+; KRYO-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX28]], align 2
+; KRYO-NEXT: [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
+; KRYO-NEXT: [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
+; KRYO-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i32 4
+; KRYO-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; KRYO-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP20]]
+; KRYO-NEXT: [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX37]], align 2
+; KRYO-NEXT: [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
+; KRYO-NEXT: [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
+; KRYO-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i32 5
+; KRYO-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; KRYO-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP23]]
+; KRYO-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX46]], align 2
+; KRYO-NEXT: [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
+; KRYO-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
+; KRYO-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i32 6
+; KRYO-NEXT: [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
+; KRYO-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP26]]
+; KRYO-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
+; KRYO-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
+; KRYO-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
+; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
+; KRYO-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i32 7
+; KRYO-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; KRYO-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
+; KRYO-NEXT: [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2
+; KRYO-NEXT: [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
+; KRYO-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
+; KRYO-NEXT: [[INC]] = add nuw nsw i32 [[I_0103]], 1
+; KRYO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; KRYO-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ;
-define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
 entry:
   %cmp.99 = icmp sgt i32 %n, 0
   br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup
@@ -138,15 +281,157 @@ for.body:
   br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
 }
 
-; KRYO-LABEL: @gather_reduce_8x16_i64
+define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
+; GENERIC-LABEL: @gather_reduce_8x16_i64(
+; GENERIC-NEXT: entry:
+; GENERIC-NEXT: [[CMP_99:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; GENERIC-NEXT: br i1 [[CMP_99]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; GENERIC: for.body.preheader:
+; GENERIC-NEXT: br label [[FOR_BODY:%.*]]
+; GENERIC: for.cond.cleanup.loopexit:
+; GENERIC-NEXT: br label [[FOR_COND_CLEANUP]]
+; GENERIC: for.cond.cleanup:
+; GENERIC-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; GENERIC-NEXT: ret i32 [[SUM_0_LCSSA]]
+; GENERIC: for.body:
+; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
+; GENERIC-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+; GENERIC-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
+; GENERIC-NEXT: [[TMP3:%.*]] = bitcast i16* [[B:%.*]] to <8 x i16>*
+; GENERIC-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
+; GENERIC-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
+; GENERIC-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
+; GENERIC-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i32 0
+; GENERIC-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; GENERIC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[G:%.*]], i64 [[TMP8]]
+; GENERIC-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+; GENERIC-NEXT: [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
+; GENERIC-NEXT: [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
+; GENERIC-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i32 1
+; GENERIC-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; GENERIC-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP11]]
+; GENERIC-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; GENERIC-NEXT: [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
+; GENERIC-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
+; GENERIC-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i32 2
+; GENERIC-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; GENERIC-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP14]]
+; GENERIC-NEXT: [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX19]], align 2
+; GENERIC-NEXT: [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
+; GENERIC-NEXT: [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
+; GENERIC-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i32 3
+; GENERIC-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; GENERIC-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP17]]
+; GENERIC-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX28]], align 2
+; GENERIC-NEXT: [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
+; GENERIC-NEXT: [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
+; GENERIC-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i32 4
+; GENERIC-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; GENERIC-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP20]]
+; GENERIC-NEXT: [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX37]], align 2
+; GENERIC-NEXT: [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
+; GENERIC-NEXT: [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
+; GENERIC-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i32 5
+; GENERIC-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; GENERIC-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP23]]
+; GENERIC-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX46]], align 2
+; GENERIC-NEXT: [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
+; GENERIC-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
+; GENERIC-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i32 6
+; GENERIC-NEXT: [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
+; GENERIC-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP26]]
+; GENERIC-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
+; GENERIC-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
+; GENERIC-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
+; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
+; GENERIC-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i32 7
+; GENERIC-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; GENERIC-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
+; GENERIC-NEXT: [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2
+; GENERIC-NEXT: [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
+; GENERIC-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
+; GENERIC-NEXT: [[INC]] = add nuw nsw i32 [[I_0103]], 1
+; GENERIC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; GENERIC-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ;
-; KRYO: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
-; KRYO: zext <8 x i16> [[L]] to <8 x i32>
-; KRYO: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
-; KRYO: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
-; KRYO: sext i32 [[X]] to i64
+; KRYO-LABEL: @gather_reduce_8x16_i64(
+; KRYO-NEXT: entry:
+; KRYO-NEXT: [[CMP_99:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; KRYO-NEXT: br i1 [[CMP_99]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; KRYO: for.body.preheader:
+; KRYO-NEXT: br label [[FOR_BODY:%.*]]
+; KRYO: for.cond.cleanup.loopexit:
+; KRYO-NEXT: br label [[FOR_COND_CLEANUP]]
+; KRYO: for.cond.cleanup:
+; KRYO-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; KRYO-NEXT: ret i32 [[SUM_0_LCSSA]]
+; KRYO: for.body:
+; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
+; KRYO-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+; KRYO-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
+; KRYO-NEXT: [[TMP3:%.*]] = bitcast i16* [[B:%.*]] to <8 x i16>*
+; KRYO-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
+; KRYO-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
+; KRYO-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
+; KRYO-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i32 0
+; KRYO-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; KRYO-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[G:%.*]], i64 [[TMP8]]
+; KRYO-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+; KRYO-NEXT: [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
+; KRYO-NEXT: [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
+; KRYO-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i32 1
+; KRYO-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; KRYO-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP11]]
+; KRYO-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; KRYO-NEXT: [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
+; KRYO-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
+; KRYO-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i32 2
+; KRYO-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; KRYO-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP14]]
+; KRYO-NEXT: [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX19]], align 2
+; KRYO-NEXT: [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
+; KRYO-NEXT: [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
+; KRYO-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i32 3
+; KRYO-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; KRYO-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP17]]
+; KRYO-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX28]], align 2
+; KRYO-NEXT: [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
+; KRYO-NEXT: [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
+; KRYO-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i32 4
+; KRYO-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; KRYO-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP20]]
+; KRYO-NEXT: [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX37]], align 2
+; KRYO-NEXT: [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
+; KRYO-NEXT: [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
+; KRYO-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i32 5
+; KRYO-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; KRYO-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP23]]
+; KRYO-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX46]], align 2
+; KRYO-NEXT: [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
+; KRYO-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
+; KRYO-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i32 6
+; KRYO-NEXT: [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
+; KRYO-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP26]]
+; KRYO-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
+; KRYO-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
+; KRYO-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
+; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
+; KRYO-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i32 7
+; KRYO-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; KRYO-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
+; KRYO-NEXT: [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2
+; KRYO-NEXT: [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
+; KRYO-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
+; KRYO-NEXT: [[INC]] = add nuw nsw i32 [[I_0103]], 1
+; KRYO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; KRYO-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ;
-define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
 entry:
   %cmp.99 = icmp sgt i32 %n, 0
   br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
index 723108fca95..4241cb922ea 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -slp-threshold=-6 -slp-vectorizer -instcombine < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -6,20 +7,26 @@ target triple = "x86_64-unknown-linux-gnu"
 ; These tests ensure that we do not regress due to PR31243. Note that we set
 ; the SLP threshold to force vectorization even when not profitable.
 
-; CHECK-LABEL: @PR31243_zext
-;
 ; When computing minimum sizes, if we can prove the sign bit is zero, we can
 ; zero-extend the roots back to their original sizes.
 ;
-; CHECK: %[[OR:.+]] = or <2 x i8> {{.*}}, <i8 1, i8 1>
-; CHECK: %[[E0:.+]] = extractelement <2 x i8> %[[OR]], i32 0
-; CHECK: %[[Z0:.+]] = zext i8 %[[E0]] to i64
-; CHECK: getelementptr inbounds i8, i8* %ptr, i64 %[[Z0]]
-; CHECK: %[[E1:.+]] = extractelement <2 x i8> %[[OR]], i32 1
-; CHECK: %[[Z1:.+]] = zext i8 %[[E1]] to i64
-; CHECK: getelementptr inbounds i8, i8* %ptr, i64 %[[Z1]]
-;
 define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, i8* %ptr) {
+; CHECK-LABEL: @PR31243_zext(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> undef, i8 [[V0:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i64
+; CHECK-NEXT: [[TMPE4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[TMPE4]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1
+; CHECK-NEXT: [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: ret i8 [[TMP8]]
+;
 entry:
   %tmp0 = zext i8 %v0 to i32
   %tmp1 = zext i8 %v1 to i32
@@ -33,8 +40,6 @@ entry:
   ret i8 %tmp8
 }
 
-; CHECK-LABEL: @PR31243_sext
-;
 ; When computing minimum sizes, if we cannot prove the sign bit is zero, we
 ; have to include one extra bit for signedness since we will sign-extend the
 ; roots.
@@ -48,16 +53,24 @@ entry:
 ; optimization, we make the proposed smaller type (i8) larger (i16) to
 ; ensure correctness.
 ;
-; CHECK: %[[OR:.+]] = or <2 x i8> {{.*}}, <i8 1, i8 1>
-; CHECK: %[[S0:.+]] = sext <2 x i8> %[[OR]] to <2 x i16>
-; CHECK: %[[E0:.+]] = extractelement <2 x i16> %[[S0]], i32 0
-; CHECK: %[[S1:.+]] = sext i16 %[[E0]] to i64
-; CHECK: getelementptr inbounds i8, i8* %ptr, i64 %[[S1]]
-; CHECK: %[[E1:.+]] = extractelement <2 x i16> %[[S0]], i32 1
-; CHECK: %[[S2:.+]] = sext i16 %[[E1]] to i64
-; CHECK: getelementptr inbounds i8, i8* %ptr, i64 %[[S2]]
-;
 define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, i8* %ptr) {
+; CHECK-LABEL: @PR31243_sext(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> undef, i8 [[V0:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
+; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = sext i16 [[TMP4]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = sext i16 [[TMP6]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1
+; CHECK-NEXT: [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: ret i8 [[TMP8]]
+;
 entry:
   %tmp0 = sext i8 %v0 to i32
   %tmp1 = sext i8 %v1 to i32
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll b/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll
new file mode 100644
index 00000000000..924422dd256
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer < %s -S -o - -mtriple=x86_64-apple-macosx10.10.0 -mcpu=core2 | FileCheck %s
+
+define <4 x i32> @sign_extend_v_v(<4 x i16> %lhs) {
+; CHECK-LABEL: @sign_extend_v_v(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x i16> [[LHS:%.*]], i32 0
+; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[VECEXT]] to i32
+; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[CONV]], i32 0
+; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i16> [[LHS]], i32 1
+; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[VECEXT1]] to i32
+; CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[CONV2]], i32 1
+; CHECK-NEXT: [[VECEXT4:%.*]] = extractelement <4 x i16> [[LHS]], i32 2
+; CHECK-NEXT: [[CONV5:%.*]] = sext i16 [[VECEXT4]] to i32
+; CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT3]], i32 [[CONV5]], i32 2
+; CHECK-NEXT: [[VECEXT7:%.*]] = extractelement <4 x i16> [[LHS]], i32 3
+; CHECK-NEXT: [[CONV8:%.*]] = sext i16 [[VECEXT7]] to i32
+; CHECK-NEXT: [[VECINIT9:%.*]] = insertelement <4 x i32> [[VECINIT6]], i32 [[CONV8]], i32 3
+; CHECK-NEXT: ret <4 x i32> [[VECINIT9]]
+;
+entry:
+  %vecext = extractelement <4 x i16> %lhs, i32 0
+  %conv = sext i16 %vecext to i32
+  %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %vecext1 = extractelement <4 x i16> %lhs, i32 1
+  %conv2 = sext i16 %vecext1 to i32
+  %vecinit3 = insertelement <4 x i32> %vecinit, i32 %conv2, i32 1
+  %vecext4 = extractelement <4 x i16> %lhs, i32 2
+  %conv5 = sext i16 %vecext4 to i32
+  %vecinit6 = insertelement <4 x i32> %vecinit3, i32 %conv5, i32 2
+  %vecext7 = extractelement <4 x i16> %lhs, i32 3
+  %conv8 = sext i16 %vecext7 to i32
+  %vecinit9 = insertelement <4 x i32> %vecinit6, i32 %conv8, i32 3
+  ret <4 x i32> %vecinit9
+}