3 files changed, 368 insertions, 35 deletions
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
index d74e26ec20a..30acee4b25c 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s --check-prefix=GENERIC
 ; RUN: opt -S -mcpu=kryo -slp-vectorizer -dce -instcombine < %s | FileCheck %s --check-prefix=KRYO
 
@@ -19,15 +20,157 @@ target triple = "aarch64--linux-gnu"
 ; return sum;
 ; }
 
-; GENERIC-LABEL: @gather_reduce_8x16_i32
+define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
+; GENERIC-LABEL: @gather_reduce_8x16_i32(
+; GENERIC-NEXT: entry:
+; GENERIC-NEXT: [[CMP_99:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; GENERIC-NEXT: br i1 [[CMP_99]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; GENERIC: for.body.preheader:
+; GENERIC-NEXT: br label [[FOR_BODY:%.*]]
+; GENERIC: for.cond.cleanup.loopexit:
+; GENERIC-NEXT: br label [[FOR_COND_CLEANUP]]
+; GENERIC: for.cond.cleanup:
+; GENERIC-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; GENERIC-NEXT: ret i32 [[SUM_0_LCSSA]]
+; GENERIC: for.body:
+; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
+; GENERIC-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+; GENERIC-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
+; GENERIC-NEXT: [[TMP3:%.*]] = bitcast i16* [[B:%.*]] to <8 x i16>*
+; GENERIC-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
+; GENERIC-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
+; GENERIC-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
+; GENERIC-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i32 0
+; GENERIC-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; GENERIC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[G:%.*]], i64 [[TMP8]]
+; GENERIC-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+; GENERIC-NEXT: [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
+; GENERIC-NEXT: [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
+; GENERIC-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i32 1
+; GENERIC-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; GENERIC-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP11]]
+; GENERIC-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; GENERIC-NEXT: [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
+; GENERIC-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
+; GENERIC-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i32 2
+; GENERIC-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; GENERIC-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP14]]
+; GENERIC-NEXT: [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX19]], align 2
+; GENERIC-NEXT: [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
+; GENERIC-NEXT: [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
+; GENERIC-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i32 3
+; GENERIC-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; GENERIC-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP17]]
+; GENERIC-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX28]], align 2
+; GENERIC-NEXT: [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
+; GENERIC-NEXT: [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
+; GENERIC-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i32 4
+; GENERIC-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; GENERIC-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP20]]
+; GENERIC-NEXT: [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX37]], align 2
+; GENERIC-NEXT: [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
+; GENERIC-NEXT: [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
+; GENERIC-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i32 5
+; GENERIC-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; GENERIC-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP23]]
+; GENERIC-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX46]], align 2
+; GENERIC-NEXT: [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
+; GENERIC-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
+; GENERIC-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i32 6
+; GENERIC-NEXT: [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
+; GENERIC-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP26]]
+; GENERIC-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
+; GENERIC-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
+; GENERIC-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
+; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
+; GENERIC-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i32 7
+; GENERIC-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; GENERIC-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
+; GENERIC-NEXT: [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2
+; GENERIC-NEXT: [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
+; GENERIC-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
+; GENERIC-NEXT: [[INC]] = add nuw nsw i32 [[I_0103]], 1
+; GENERIC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; GENERIC-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ;
-; GENERIC: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
-; GENERIC: zext <8 x i16> [[L]] to <8 x i32>
-; GENERIC: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
-; GENERIC: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
-; GENERIC: sext i32 [[X]] to i64
+; KRYO-LABEL: @gather_reduce_8x16_i32(
+; KRYO-NEXT: entry:
+; KRYO-NEXT: [[CMP_99:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; KRYO-NEXT: br i1 [[CMP_99]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; KRYO: for.body.preheader:
+; KRYO-NEXT: br label [[FOR_BODY:%.*]]
+; KRYO: for.cond.cleanup.loopexit:
+; KRYO-NEXT: br label [[FOR_COND_CLEANUP]]
+; KRYO: for.cond.cleanup:
+; KRYO-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; KRYO-NEXT: ret i32 [[SUM_0_LCSSA]]
+; KRYO: for.body:
+; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
+; KRYO-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+; KRYO-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
+; KRYO-NEXT: [[TMP3:%.*]] = bitcast i16* [[B:%.*]] to <8 x i16>*
+; KRYO-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
+; KRYO-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
+; KRYO-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
+; KRYO-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i32 0
+; KRYO-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; KRYO-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[G:%.*]], i64 [[TMP8]]
+; KRYO-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+; KRYO-NEXT: [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
+; KRYO-NEXT: [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
+; KRYO-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i32 1
+; KRYO-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; KRYO-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP11]]
+; KRYO-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; KRYO-NEXT: [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
+; KRYO-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
+; KRYO-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i32 2
+; KRYO-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; KRYO-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP14]]
+; KRYO-NEXT: [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX19]], align 2
+; KRYO-NEXT: [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
+; KRYO-NEXT: [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
+; KRYO-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i32 3
+; KRYO-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; KRYO-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP17]]
+; KRYO-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX28]], align 2
+; KRYO-NEXT: [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
+; KRYO-NEXT: [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
+; KRYO-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i32 4
+; KRYO-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; KRYO-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP20]]
+; KRYO-NEXT: [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX37]], align 2
+; KRYO-NEXT: [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
+; KRYO-NEXT: [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
+; KRYO-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i32 5
+; KRYO-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; KRYO-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP23]]
+; KRYO-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX46]], align 2
+; KRYO-NEXT: [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
+; KRYO-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
+; KRYO-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i32 6
+; KRYO-NEXT: [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
+; KRYO-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP26]]
+; KRYO-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
+; KRYO-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
+; KRYO-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
+; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
+; KRYO-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i32 7
+; KRYO-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; KRYO-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
+; KRYO-NEXT: [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2
+; KRYO-NEXT: [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
+; KRYO-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
+; KRYO-NEXT: [[INC]] = add nuw nsw i32 [[I_0103]], 1
+; KRYO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; KRYO-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ;
-define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
 entry:
   %cmp.99 = icmp sgt i32 %n, 0
   br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup
@@ -138,15 +281,157 @@ for.body:
   br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
 }
 
-; KRYO-LABEL: @gather_reduce_8x16_i64
+define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
+; GENERIC-LABEL: @gather_reduce_8x16_i64(
+; GENERIC-NEXT: entry:
+; GENERIC-NEXT: [[CMP_99:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; GENERIC-NEXT: br i1 [[CMP_99]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; GENERIC: for.body.preheader:
+; GENERIC-NEXT: br label [[FOR_BODY:%.*]]
+; GENERIC: for.cond.cleanup.loopexit:
+; GENERIC-NEXT: br label [[FOR_COND_CLEANUP]]
+; GENERIC: for.cond.cleanup:
+; GENERIC-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; GENERIC-NEXT: ret i32 [[SUM_0_LCSSA]]
+; GENERIC: for.body:
+; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
+; GENERIC-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+; GENERIC-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
+; GENERIC-NEXT: [[TMP3:%.*]] = bitcast i16* [[B:%.*]] to <8 x i16>*
+; GENERIC-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
+; GENERIC-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
+; GENERIC-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
+; GENERIC-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i32 0
+; GENERIC-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; GENERIC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[G:%.*]], i64 [[TMP8]]
+; GENERIC-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+; GENERIC-NEXT: [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
+; GENERIC-NEXT: [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
+; GENERIC-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i32 1
+; GENERIC-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; GENERIC-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP11]]
+; GENERIC-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; GENERIC-NEXT: [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
+; GENERIC-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
+; GENERIC-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i32 2
+; GENERIC-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; GENERIC-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP14]]
+; GENERIC-NEXT: [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX19]], align 2
+; GENERIC-NEXT: [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
+; GENERIC-NEXT: [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
+; GENERIC-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i32 3
+; GENERIC-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; GENERIC-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP17]]
+; GENERIC-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX28]], align 2
+; GENERIC-NEXT: [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
+; GENERIC-NEXT: [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
+; GENERIC-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i32 4
+; GENERIC-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; GENERIC-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP20]]
+; GENERIC-NEXT: [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX37]], align 2
+; GENERIC-NEXT: [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
+; GENERIC-NEXT: [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
+; GENERIC-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i32 5
+; GENERIC-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; GENERIC-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP23]]
+; GENERIC-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX46]], align 2
+; GENERIC-NEXT: [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
+; GENERIC-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
+; GENERIC-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i32 6
+; GENERIC-NEXT: [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
+; GENERIC-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP26]]
+; GENERIC-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
+; GENERIC-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
+; GENERIC-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
+; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
+; GENERIC-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i32 7
+; GENERIC-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; GENERIC-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
+; GENERIC-NEXT: [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2
+; GENERIC-NEXT: [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
+; GENERIC-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
+; GENERIC-NEXT: [[INC]] = add nuw nsw i32 [[I_0103]], 1
+; GENERIC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; GENERIC-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ;
-; KRYO: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
-; KRYO: zext <8 x i16> [[L]] to <8 x i32>
-; KRYO: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
-; KRYO: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
-; KRYO: sext i32 [[X]] to i64
+; KRYO-LABEL: @gather_reduce_8x16_i64(
+; KRYO-NEXT: entry:
+; KRYO-NEXT: [[CMP_99:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; KRYO-NEXT: br i1 [[CMP_99]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; KRYO: for.body.preheader:
+; KRYO-NEXT: br label [[FOR_BODY:%.*]]
+; KRYO: for.cond.cleanup.loopexit:
+; KRYO-NEXT: br label [[FOR_COND_CLEANUP]]
+; KRYO: for.cond.cleanup:
+; KRYO-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; KRYO-NEXT: ret i32 [[SUM_0_LCSSA]]
+; KRYO: for.body:
+; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
+; KRYO-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+; KRYO-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
+; KRYO-NEXT: [[TMP3:%.*]] = bitcast i16* [[B:%.*]] to <8 x i16>*
+; KRYO-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
+; KRYO-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
+; KRYO-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
+; KRYO-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i32 0
+; KRYO-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; KRYO-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[G:%.*]], i64 [[TMP8]]
+; KRYO-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+; KRYO-NEXT: [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
+; KRYO-NEXT: [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
+; KRYO-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i32 1
+; KRYO-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; KRYO-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP11]]
+; KRYO-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; KRYO-NEXT: [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
+; KRYO-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
+; KRYO-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i32 2
+; KRYO-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; KRYO-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP14]]
+; KRYO-NEXT: [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX19]], align 2
+; KRYO-NEXT: [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
+; KRYO-NEXT: [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
+; KRYO-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i32 3
+; KRYO-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; KRYO-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP17]]
+; KRYO-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX28]], align 2
+; KRYO-NEXT: [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
+; KRYO-NEXT: [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
+; KRYO-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i32 4
+; KRYO-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; KRYO-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP20]]
+; KRYO-NEXT: [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX37]], align 2
+; KRYO-NEXT: [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
+; KRYO-NEXT: [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
+; KRYO-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i32 5
+; KRYO-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; KRYO-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP23]]
+; KRYO-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX46]], align 2
+; KRYO-NEXT: [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
+; KRYO-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
+; KRYO-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i32 6
+; KRYO-NEXT: [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
+; KRYO-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP26]]
+; KRYO-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
+; KRYO-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
+; KRYO-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
+; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
+; KRYO-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i32 7
+; KRYO-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; KRYO-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
+; KRYO-NEXT: [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2
+; KRYO-NEXT: [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
+; KRYO-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
+; KRYO-NEXT: [[INC]] = add nuw nsw i32 [[I_0103]], 1
+; KRYO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; KRYO-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ;
-define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
 entry:
   %cmp.99 = icmp sgt i32 %n, 0
   br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
index 723108fca95..4241cb922ea 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -slp-threshold=-6 -slp-vectorizer -instcombine < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -6,20 +7,26 @@ target triple = "x86_64-unknown-linux-gnu"
 ; These tests ensure that we do not regress due to PR31243. Note that we set
 ; the SLP threshold to force vectorization even when not profitable.
 
-; CHECK-LABEL: @PR31243_zext
-;
 ; When computing minimum sizes, if we can prove the sign bit is zero, we can
 ; zero-extend the roots back to their original sizes.
 ;
-; CHECK: %[[OR:.+]] = or <2 x i8> {{.*}}, <i8 1, i8 1>
-; CHECK: %[[E0:.+]] = extractelement <2 x i8> %[[OR]], i32 0
-; CHECK: %[[Z0:.+]] = zext i8 %[[E0]] to i64
-; CHECK: getelementptr inbounds i8, i8* %ptr, i64 %[[Z0]]
-; CHECK: %[[E1:.+]] = extractelement <2 x i8> %[[OR]], i32 1
-; CHECK: %[[Z1:.+]] = zext i8 %[[E1]] to i64
-; CHECK: getelementptr inbounds i8, i8* %ptr, i64 %[[Z1]]
-;
 define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, i8* %ptr) {
+; CHECK-LABEL: @PR31243_zext(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> undef, i8 [[V0:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i64
+; CHECK-NEXT: [[TMPE4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[TMPE4]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1
+; CHECK-NEXT: [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: ret i8 [[TMP8]]
+;
 entry:
   %tmp0 = zext i8 %v0 to i32
   %tmp1 = zext i8 %v1 to i32
@@ -33,8 +40,6 @@ entry:
   ret i8 %tmp8
 }
 
-; CHECK-LABEL: @PR31243_sext
-;
 ; When computing minimum sizes, if we cannot prove the sign bit is zero, we
 ; have to include one extra bit for signedness since we will sign-extend the
 ; roots.
@@ -48,16 +53,24 @@ entry:
 ; optimization, we make the proposed smaller type (i8) larger (i16) to
 ; ensure correctness.
 ;
-; CHECK: %[[OR:.+]] = or <2 x i8> {{.*}}, <i8 1, i8 1>
-; CHECK: %[[S0:.+]] = sext <2 x i8> %[[OR]] to <2 x i16>
-; CHECK: %[[E0:.+]] = extractelement <2 x i16> %[[S0]], i32 0
-; CHECK: %[[S1:.+]] = sext i16 %[[E0]] to i64
-; CHECK: getelementptr inbounds i8, i8* %ptr, i64 %[[S1]]
-; CHECK: %[[E1:.+]] = extractelement <2 x i16> %[[S0]], i32 1
-; CHECK: %[[S2:.+]] = sext i16 %[[E1]] to i64
-; CHECK: getelementptr inbounds i8, i8* %ptr, i64 %[[S2]]
-;
 define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, i8* %ptr) {
+; CHECK-LABEL: @PR31243_sext(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> undef, i8 [[V0:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
+; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = sext i16 [[TMP4]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = sext i16 [[TMP6]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1
+; CHECK-NEXT: [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: ret i8 [[TMP8]]
+;
 entry:
   %tmp0 = sext i8 %v0 to i32
   %tmp1 = sext i8 %v1 to i32
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll b/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll
new file mode 100644
index 00000000000..924422dd256
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer < %s -S -o - -mtriple=x86_64-apple-macosx10.10.0 -mcpu=core2 | FileCheck %s
+
+define <4 x i32> @sign_extend_v_v(<4 x i16> %lhs) {
+; CHECK-LABEL: @sign_extend_v_v(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x i16> [[LHS:%.*]], i32 0
+; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[VECEXT]] to i32
+; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[CONV]], i32 0
+; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i16> [[LHS]], i32 1
+; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[VECEXT1]] to i32
+; CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[CONV2]], i32 1
+; CHECK-NEXT: [[VECEXT4:%.*]] = extractelement <4 x i16> [[LHS]], i32 2
+; CHECK-NEXT: [[CONV5:%.*]] = sext i16 [[VECEXT4]] to i32
+; CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT3]], i32 [[CONV5]], i32 2
+; CHECK-NEXT: [[VECEXT7:%.*]] = extractelement <4 x i16> [[LHS]], i32 3
+; CHECK-NEXT: [[CONV8:%.*]] = sext i16 [[VECEXT7]] to i32
+; CHECK-NEXT: [[VECINIT9:%.*]] = insertelement <4 x i32> [[VECINIT6]], i32 [[CONV8]], i32 3
+; CHECK-NEXT: ret <4 x i32> [[VECINIT9]]
+;
+entry:
+  %vecext = extractelement <4 x i16> %lhs, i32 0
+  %conv = sext i16 %vecext to i32
+  %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+  %vecext1 = extractelement <4 x i16> %lhs, i32 1
+  %conv2 = sext i16 %vecext1 to i32
+  %vecinit3 = insertelement <4 x i32> %vecinit, i32 %conv2, i32 1
+  %vecext4 = extractelement <4 x i16> %lhs, i32 2
+  %conv5 = sext i16 %vecext4 to i32
+  %vecinit6 = insertelement <4 x i32> %vecinit3, i32 %conv5, i32 2
+  %vecext7 = extractelement <4 x i16> %lhs, i32 3
+  %conv8 = sext i16 %vecext7 to i32
+  %vecinit9 = insertelement <4 x i32> %vecinit6, i32 %conv8, i32 3
+  ret <4 x i32> %vecinit9
+}