; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 | FileCheck %s
; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -slp-threshold=-10 | FileCheck %s --check-prefix=THRESHOLD

@n = external local_unnamed_addr global i32, align 4
@arr = common local_unnamed_addr global [20 x float] zeroinitializer, align 16
@arr1 = common local_unnamed_addr global [20 x float] zeroinitializer, align 16
@res = external local_unnamed_addr global float, align 4

; Horizontal fadd reduction over arr[i]*arr1[i] where each product is reused
; by a second reduction chain; SLP vectorizes the multiplies as <2 x float>.
define float @baz() {
; CHECK-LABEL: @baz(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @n, align 4
; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]]
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]]
; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]]
; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]]
; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
; CHECK-NEXT:    [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]]
; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
; CHECK-NEXT:    [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]]
; CHECK-NEXT:    [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV]]
; CHECK-NEXT:    [[ADD19:%.*]] = fadd fast float [[TMP4]], [[ADD7]]
; CHECK-NEXT:    [[ADD19_1:%.*]] = fadd fast float [[TMP5]], [[ADD19]]
; CHECK-NEXT:    [[ADD19_2:%.*]] = fadd fast float [[TMP9]], [[ADD19_1]]
; CHECK-NEXT:    [[ADD19_3:%.*]] = fadd fast float [[TMP10]], [[ADD19_2]]
; CHECK-NEXT:    store float [[ADD19_3]], float* @res, align 4
; CHECK-NEXT:    ret float [[ADD19_3]]
;
; THRESHOLD-LABEL: @baz(
; THRESHOLD-NEXT:  entry:
; THRESHOLD-NEXT:    [[TMP0:%.*]] = load i32, i32* @n, align 4
; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]]
; THRESHOLD-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
; THRESHOLD-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]]
; THRESHOLD-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
; THRESHOLD-NEXT:    [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]]
; THRESHOLD-NEXT:    [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
; THRESHOLD-NEXT:    [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
; THRESHOLD-NEXT:    [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]]
; THRESHOLD-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
; THRESHOLD-NEXT:    [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]]
; THRESHOLD-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
; THRESHOLD-NEXT:    [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]]
; THRESHOLD-NEXT:    [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV]]
; THRESHOLD-NEXT:    [[ADD19:%.*]] = fadd fast float [[TMP4]], [[ADD7]]
; THRESHOLD-NEXT:    [[ADD19_1:%.*]] = fadd fast float [[TMP5]], [[ADD19]]
; THRESHOLD-NEXT:    [[ADD19_2:%.*]] = fadd fast float [[TMP9]], [[ADD19_1]]
; THRESHOLD-NEXT:    [[ADD19_3:%.*]] = fadd fast float [[TMP10]], [[ADD19_2]]
; THRESHOLD-NEXT:    store float [[ADD19_3]], float* @res, align 4
; THRESHOLD-NEXT:    ret float [[ADD19_3]]
;
entry:
  %0 = load i32, i32* @n, align 4
  %mul = mul nsw i32 %0, 3
  %conv = sitofp i32 %mul to float
  %1 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
  %2 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
  %mul4 = fmul fast float %2, %1
  %add = fadd fast float %mul4, %conv
  %3 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
  %4 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
  %mul4.1 = fmul fast float %4, %3
  %add.1 = fadd fast float %mul4.1, %add
  %5 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
  %6 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
  %mul4.2 = fmul fast float %6, %5
  %add.2 = fadd fast float %mul4.2, %add.1
  %7 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
  %8 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
  %mul4.3 = fmul fast float %8, %7
  %add.3 = fadd fast float %mul4.3, %add.2
  %add7 = fadd fast float %add.3, %conv
  %add19 = fadd fast float %mul4, %add7
  %add19.1 = fadd fast float %mul4.1, %add19
  %add19.2 = fadd fast float %mul4.2, %add19.1
  %add19.3 = fadd fast float %mul4.3, %add19.2
  store float %add19.3, float* @res, align 4
  ret float %add19.3
}

; Eight-element product reduction with two scalar extra arguments (conv, conv6);
; expected to become a single <8 x float> fmul plus a shuffle reduction.
define float @bazz() {
; CHECK-LABEL: @bazz(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @n, align 4
; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
; CHECK-NEXT:    [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
; CHECK-NEXT:    [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> undef, <8 x i32>
; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP3]], [[RDX_SHUF]]
; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32>
; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32>
; CHECK-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
; CHECK-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]]
; CHECK-NEXT:    [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]]
; CHECK-NEXT:    store float [[OP_EXTRA5]], float* @res, align 4
; CHECK-NEXT:    ret float [[OP_EXTRA5]]
;
; THRESHOLD-LABEL: @bazz(
; THRESHOLD-NEXT:  entry:
; THRESHOLD-NEXT:    [[TMP0:%.*]] = load i32, i32* @n, align 4
; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16
; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16
; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
; THRESHOLD-NEXT:    [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
; THRESHOLD-NEXT:    [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
; THRESHOLD-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> undef, <8 x i32>
; THRESHOLD-NEXT:    [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP3]], [[RDX_SHUF]]
; THRESHOLD-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32>
; THRESHOLD-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; THRESHOLD-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32>
; THRESHOLD-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
; THRESHOLD-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
; THRESHOLD-NEXT:    [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]]
; THRESHOLD-NEXT:    [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]]
; THRESHOLD-NEXT:    store float [[OP_EXTRA5]], float* @res, align 4
; THRESHOLD-NEXT:    ret float [[OP_EXTRA5]]
;
entry:
  %0 = load i32, i32* @n, align 4
  %mul = mul nsw i32 %0, 3
  %conv = sitofp i32 %mul to float
  %1 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
  %2 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
  %mul4 = fmul fast float %2, %1
  %add = fadd fast float %mul4, %conv
  %3 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
  %4 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
  %mul4.1 = fmul fast float %4, %3
  %add.1 = fadd fast float %mul4.1, %add
  %5 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
  %6 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
  %mul4.2 = fmul fast float %6, %5
  %add.2 = fadd fast float %mul4.2, %add.1
  %7 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
  %8 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
  %mul4.3 = fmul fast float %8, %7
  %add.3 = fadd fast float %mul4.3, %add.2
  %mul5 = shl nsw i32 %0, 2
  %conv6 = sitofp i32 %mul5 to float
  %add7 = fadd fast float %add.3, %conv6
  %9 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 4), align 16
  %10 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 4), align 16
  %mul18 = fmul fast float %10, %9
  %add19 = fadd fast float %mul18, %add7
  %11 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 5), align 4
  %12 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 5), align 4
  %mul18.1 = fmul fast float %12, %11
  %add19.1 = fadd fast float %mul18.1, %add19
  %13 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 6), align 8
  %14 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 6), align 8
  %mul18.2 = fmul fast float %14, %13
  %add19.2 = fadd fast float %mul18.2, %add19.1
  %15 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 7), align 4
  %16 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 7), align 4
  %mul18.3 = fmul fast float %16, %15
  %add19.3 = fadd fast float %mul18.3, %add19.2
  store float %add19.3, float* @res, align 4
  ret float %add19.3
}

; Four-element product reduction whose result is scaled by a scalar; expected
; to become a <4 x float> fmul plus a shuffle reduction.
define float @bazzz() {
; CHECK-LABEL: @bazzz(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @n, align 4
; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32>
; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32>
; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]]
; CHECK-NEXT:    store float [[TMP5]], float* @res, align 4
; CHECK-NEXT:    ret float [[TMP5]]
;
; THRESHOLD-LABEL: @bazzz(
; THRESHOLD-NEXT:  entry:
; THRESHOLD-NEXT:    [[TMP0:%.*]] = load i32, i32* @n, align 4
; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
; THRESHOLD-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32>
; THRESHOLD-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
; THRESHOLD-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32>
; THRESHOLD-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; THRESHOLD-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
; THRESHOLD-NEXT:    [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]]
; THRESHOLD-NEXT:    store float [[TMP5]], float* @res, align 4
; THRESHOLD-NEXT:    ret float [[TMP5]]
;
entry:
  %0 = load i32, i32* @n, align 4
  %conv = sitofp i32 %0 to float
  %1 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
  %2 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
  %mul = fmul fast float %2, %1
  %3 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
  %4 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
  %mul.1 = fmul fast float %4, %3
  %5 = fadd fast float %mul.1, %mul
  %6 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
  %7 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
  %mul.2 = fmul fast float %7, %6
  %8 = fadd fast float %mul.2, %5
  %9 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
  %10 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
  %mul.3 = fmul fast float %10, %9
  %11 = fadd fast float %mul.3, %8
  %12 = fmul fast float %conv, %11
  store float %12, float* @res, align 4
  ret float %12
}

; Same reduction as @bazzz but the scaled result is converted back to i32 and
; stored to @n.
define i32 @foo() {
; CHECK-LABEL: @foo(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @n, align 4
; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32>
; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32>
; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]]
; CHECK-NEXT:    [[CONV4:%.*]] = fptosi float [[TMP5]] to i32
; CHECK-NEXT:    store i32 [[CONV4]], i32* @n, align 4
; CHECK-NEXT:    ret i32 [[CONV4]]
;
; THRESHOLD-LABEL: @foo(
; THRESHOLD-NEXT:  entry:
; THRESHOLD-NEXT:    [[TMP0:%.*]] = load i32, i32* @n, align 4
; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
; THRESHOLD-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32>
; THRESHOLD-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
; THRESHOLD-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32>
; THRESHOLD-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; THRESHOLD-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
; THRESHOLD-NEXT:    [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]]
; THRESHOLD-NEXT:    [[CONV4:%.*]] = fptosi float [[TMP5]] to i32
; THRESHOLD-NEXT:    store i32 [[CONV4]], i32* @n, align 4
; THRESHOLD-NEXT:    ret i32 [[CONV4]]
;
entry:
  %0 = load i32, i32* @n, align 4
  %conv = sitofp i32 %0 to float
  %1 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
  %2 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
  %mul = fmul fast float %2, %1
  %3 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
  %4 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
  %mul.1 = fmul fast float %4, %3
  %5 = fadd fast float %mul.1, %mul
  %6 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
  %7 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
  %mul.2 = fmul fast float %7, %6
  %8 = fadd fast float %mul.2, %5
  %9 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
  %10 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
  %mul.3 = fmul fast float %10, %9
  %11 = fadd fast float %mul.3, %8
  %12 = fmul fast float %conv, %11
  %conv4 = fptosi float %12 to i32
  store i32 %conv4, i32* @n, align 4
  ret i32 %conv4
}

; fmax reduction over four products, expressed with fcmp ogt + select; expected
; to become a vector min/max shuffle reduction.
define float @bar() {
; CHECK-LABEL: @bar(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]]
; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32>
; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x float> [[TMP2]], [[RDX_SHUF]]
; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP2]], <4 x float> [[RDX_SHUF]]
; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32>
; CHECK-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
; CHECK-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]]
; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0
; CHECK-NEXT:    store float [[TMP3]], float* @res, align 4
; CHECK-NEXT:    ret float [[TMP3]]
;
; THRESHOLD-LABEL: @bar(
; THRESHOLD-NEXT:  entry:
; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
; THRESHOLD-NEXT:    [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]]
; THRESHOLD-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32>
; THRESHOLD-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x float> [[TMP2]], [[RDX_SHUF]]
; THRESHOLD-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP2]], <4 x float> [[RDX_SHUF]]
; THRESHOLD-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32>
; THRESHOLD-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
; THRESHOLD-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]]
; THRESHOLD-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0
; THRESHOLD-NEXT:    store float [[TMP3]], float* @res, align 4
; THRESHOLD-NEXT:    ret float [[TMP3]]
;
entry:
  %0 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
  %1 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
  %mul = fmul fast float %1, %0
  %2 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
  %3 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
  %mul3 = fmul fast float %3, %2
  %cmp4 = fcmp fast ogt float %mul, %mul3
  %max.0.mul3 = select i1 %cmp4, float %mul, float %mul3
  %4 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
  %5 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
  %mul3.1 = fmul fast float %5, %4
  %cmp4.1 = fcmp fast ogt float %max.0.mul3, %mul3.1
  %max.0.mul3.1 = select i1 %cmp4.1, float %max.0.mul3, float %mul3.1
  %6 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
  %7 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
  %mul3.2 = fmul fast float %7, %6
  %cmp4.2 = fcmp fast ogt float %max.0.mul3.1, %mul3.2
  %max.0.mul3.2 = select i1 %cmp4.2, float %max.0.mul3.1, float %mul3.2
  store float %max.0.mul3.2, float* @res, align 4
  ret float %max.0.mul3.2
}

; 48-element fadd reduction over x[0..47]; expected to be split into a
; <16 x float> and a <32 x float> load with two shuffle reductions combined
; by a final scalar fadd.
define float @f(float* nocapture readonly %x) {
; CHECK-LABEL: @f(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
; CHECK-NEXT:    [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
; CHECK-NEXT:    [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
; CHECK-NEXT:    [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 10
; CHECK-NEXT:    [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 11
; CHECK-NEXT:    [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
; CHECK-NEXT:    [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
; CHECK-NEXT:    [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
; CHECK-NEXT:    [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <16 x float>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[TMP0]], align 4
; CHECK-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
; CHECK-NEXT:    [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
; CHECK-NEXT:    [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18
; CHECK-NEXT:    [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 19
; CHECK-NEXT:    [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 20
; CHECK-NEXT:    [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 21
; CHECK-NEXT:    [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 22
; CHECK-NEXT:    [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 23
; CHECK-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 24
; CHECK-NEXT:    [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
; CHECK-NEXT:    [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 26
; CHECK-NEXT:    [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 27
; CHECK-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 28
; CHECK-NEXT:    [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
; CHECK-NEXT:    [[ARRAYIDX_30:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
; CHECK-NEXT:    [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31
; CHECK-NEXT:    [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, float* [[X]], i64 32
; CHECK-NEXT:    [[ARRAYIDX_33:%.*]] = getelementptr inbounds float, float* [[X]], i64 33
; CHECK-NEXT:    [[ARRAYIDX_34:%.*]] = getelementptr inbounds float, float* [[X]], i64 34
; CHECK-NEXT:    [[ARRAYIDX_35:%.*]] = getelementptr inbounds float, float* [[X]], i64 35
; CHECK-NEXT:    [[ARRAYIDX_36:%.*]] = getelementptr inbounds float, float* [[X]], i64 36
; CHECK-NEXT:    [[ARRAYIDX_37:%.*]] = getelementptr inbounds float, float* [[X]], i64 37
; CHECK-NEXT:    [[ARRAYIDX_38:%.*]] = getelementptr inbounds float, float* [[X]], i64 38
; CHECK-NEXT:    [[ARRAYIDX_39:%.*]] = getelementptr inbounds float, float* [[X]], i64 39
; CHECK-NEXT:    [[ARRAYIDX_40:%.*]] = getelementptr inbounds float, float* [[X]], i64 40
; CHECK-NEXT:    [[ARRAYIDX_41:%.*]] = getelementptr inbounds float, float* [[X]], i64 41
; CHECK-NEXT:    [[ARRAYIDX_42:%.*]] = getelementptr inbounds float, float* [[X]], i64 42
; CHECK-NEXT:    [[ARRAYIDX_43:%.*]] = getelementptr inbounds float, float* [[X]], i64 43
; CHECK-NEXT:    [[ARRAYIDX_44:%.*]] = getelementptr inbounds float, float* [[X]], i64 44
; CHECK-NEXT:    [[ARRAYIDX_45:%.*]] = getelementptr inbounds float, float* [[X]], i64 45
; CHECK-NEXT:    [[ARRAYIDX_46:%.*]] = getelementptr inbounds float, float* [[X]], i64 46
; CHECK-NEXT:    [[ARRAYIDX_47:%.*]] = getelementptr inbounds float, float* [[X]], i64 47
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <32 x float>*
; CHECK-NEXT:    [[TMP3:%.*]] = load <32 x float>, <32 x float>* [[TMP2]], align 4
; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP3]], <32 x float> undef, <32 x i32>
; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP3]], [[RDX_SHUF]]
; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32>
; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <32 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32>
; CHECK-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
; CHECK-NEXT:    [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32>
; CHECK-NEXT:    [[BIN_RDX6:%.*]] = fadd fast <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
; CHECK-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32>
; CHECK-NEXT:    [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]]
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0
; CHECK-NEXT:    [[RDX_SHUF9:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> undef, <16 x i32>
; CHECK-NEXT:    [[BIN_RDX10:%.*]] = fadd fast <16 x float> [[TMP1]], [[RDX_SHUF9]]
; CHECK-NEXT:    [[RDX_SHUF11:%.*]] = shufflevector <16 x float> [[BIN_RDX10]], <16 x float> undef, <16 x i32>
; CHECK-NEXT:    [[BIN_RDX12:%.*]] = fadd fast <16 x float> [[BIN_RDX10]], [[RDX_SHUF11]]
; CHECK-NEXT:    [[RDX_SHUF13:%.*]] = shufflevector <16 x float> [[BIN_RDX12]], <16 x float> undef, <16 x i32>
; CHECK-NEXT:    [[BIN_RDX14:%.*]] = fadd fast <16 x float> [[BIN_RDX12]], [[RDX_SHUF13]]
; CHECK-NEXT:    [[RDX_SHUF15:%.*]] = shufflevector <16 x float> [[BIN_RDX14]], <16 x float> undef, <16 x i32>
; CHECK-NEXT:    [[BIN_RDX16:%.*]] = fadd fast <16 x float> [[BIN_RDX14]], [[RDX_SHUF15]]
; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x float> [[BIN_RDX16]], i32 0
; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
; CHECK-NEXT:    ret float [[OP_RDX]]
;
; THRESHOLD-LABEL: @f(
; THRESHOLD-NEXT:  entry:
; THRESHOLD-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
; THRESHOLD-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
; THRESHOLD-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
; THRESHOLD-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
; THRESHOLD-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
; THRESHOLD-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
; THRESHOLD-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
; THRESHOLD-NEXT:    [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
; THRESHOLD-NEXT:    [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
; THRESHOLD-NEXT:    [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 10
; THRESHOLD-NEXT:    [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 11
; THRESHOLD-NEXT:    [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
; THRESHOLD-NEXT:    [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
; THRESHOLD-NEXT:    [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
; THRESHOLD-NEXT:    [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
; THRESHOLD-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <16 x float>*
; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[TMP0]], align 4
; THRESHOLD-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
; THRESHOLD-NEXT:    [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
; THRESHOLD-NEXT:    [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18
; THRESHOLD-NEXT:    [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 19
; THRESHOLD-NEXT:    [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 20
; THRESHOLD-NEXT:    [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 21
; THRESHOLD-NEXT:    [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 22
; THRESHOLD-NEXT:    [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 23
; THRESHOLD-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 24
; THRESHOLD-NEXT:    [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
; THRESHOLD-NEXT:    [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 26
; THRESHOLD-NEXT:    [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 27
; THRESHOLD-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 28
; THRESHOLD-NEXT:    [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
; THRESHOLD-NEXT:    [[ARRAYIDX_30:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
; THRESHOLD-NEXT:    [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31
; THRESHOLD-NEXT:    [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, float* [[X]], i64 32
; THRESHOLD-NEXT:    [[ARRAYIDX_33:%.*]] = getelementptr inbounds float, float* [[X]], i64 33
; THRESHOLD-NEXT:    [[ARRAYIDX_34:%.*]] = getelementptr inbounds float, float* [[X]], i64 34
; THRESHOLD-NEXT:    [[ARRAYIDX_35:%.*]] = getelementptr inbounds float, float* [[X]], i64 35
; THRESHOLD-NEXT:    [[ARRAYIDX_36:%.*]] = getelementptr inbounds float, float* [[X]], i64 36
; THRESHOLD-NEXT:    [[ARRAYIDX_37:%.*]] = getelementptr inbounds float, float* [[X]], i64 37
; THRESHOLD-NEXT:    [[ARRAYIDX_38:%.*]] = getelementptr inbounds float, float* [[X]], i64 38
; THRESHOLD-NEXT:    [[ARRAYIDX_39:%.*]] = getelementptr inbounds float, float* [[X]], i64 39
; THRESHOLD-NEXT:    [[ARRAYIDX_40:%.*]] = getelementptr inbounds float, float* [[X]], i64 40
; THRESHOLD-NEXT:    [[ARRAYIDX_41:%.*]] = getelementptr inbounds float, float* [[X]], i64 41
; THRESHOLD-NEXT:    [[ARRAYIDX_42:%.*]] = getelementptr inbounds float, float* [[X]], i64 42
; THRESHOLD-NEXT:    [[ARRAYIDX_43:%.*]] = getelementptr inbounds float, float* [[X]], i64 43
; THRESHOLD-NEXT:    [[ARRAYIDX_44:%.*]] = getelementptr inbounds float, float* [[X]], i64 44
; THRESHOLD-NEXT:    [[ARRAYIDX_45:%.*]] = getelementptr inbounds float, float* [[X]], i64 45
; THRESHOLD-NEXT:    [[ARRAYIDX_46:%.*]] = getelementptr inbounds float, float* [[X]], i64 46
; THRESHOLD-NEXT:    [[ARRAYIDX_47:%.*]] = getelementptr inbounds float, float* [[X]], i64 47
; THRESHOLD-NEXT:    [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <32 x float>*
; THRESHOLD-NEXT:    [[TMP3:%.*]] = load <32 x float>, <32 x float>* [[TMP2]], align 4
; THRESHOLD-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP3]], <32 x float> undef, <32 x i32>
; THRESHOLD-NEXT:    [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP3]], [[RDX_SHUF]]
; THRESHOLD-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32>
; THRESHOLD-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <32 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; THRESHOLD-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32>
; THRESHOLD-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
; THRESHOLD-NEXT:    [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32>
; THRESHOLD-NEXT:    [[BIN_RDX6:%.*]] = fadd fast <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
; THRESHOLD-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32>
; THRESHOLD-NEXT:    [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]]
; THRESHOLD-NEXT:    [[TMP4:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0
; THRESHOLD-NEXT:    [[RDX_SHUF9:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> undef, <16 x i32>
; THRESHOLD-NEXT:    [[BIN_RDX10:%.*]] = fadd fast <16 x float> [[TMP1]], [[RDX_SHUF9]]
; THRESHOLD-NEXT:    [[RDX_SHUF11:%.*]] = shufflevector <16 x float> [[BIN_RDX10]], <16 x float> undef, <16 x i32>
; THRESHOLD-NEXT:    [[BIN_RDX12:%.*]] = fadd fast <16 x float> [[BIN_RDX10]], [[RDX_SHUF11]]
; THRESHOLD-NEXT:    [[RDX_SHUF13:%.*]] = shufflevector <16 x float> [[BIN_RDX12]], <16 x float> undef, <16 x i32>
; THRESHOLD-NEXT:    [[BIN_RDX14:%.*]] = fadd fast <16 x float> [[BIN_RDX12]], [[RDX_SHUF13]]
; THRESHOLD-NEXT:    [[RDX_SHUF15:%.*]] = shufflevector <16 x float> [[BIN_RDX14]], <16 x float> undef, <16 x i32>
; THRESHOLD-NEXT:    [[BIN_RDX16:%.*]] = fadd fast <16 x float> [[BIN_RDX14]], [[RDX_SHUF15]]
; THRESHOLD-NEXT:    [[TMP5:%.*]] = extractelement <16 x float> [[BIN_RDX16]], i32 0
; THRESHOLD-NEXT:    [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
; THRESHOLD-NEXT:    ret float [[OP_RDX]]
;
entry:
  %0 = load float, float* %x, align 4
  %arrayidx.1 = getelementptr inbounds float, float* %x, i64 1
  %1 = load float, float* %arrayidx.1, align 4
  %add.1 = fadd fast float %1, %0
  %arrayidx.2 = getelementptr inbounds float, float* %x, i64 2
  %2 =
load float, float* %arrayidx.2, align 4 %add.2 = fadd fast float %2, %add.1 %arrayidx.3 = getelementptr inbounds float, float* %x, i64 3 %3 = load float, float* %arrayidx.3, align 4 %add.3 = fadd fast float %3, %add.2 %arrayidx.4 = getelementptr inbounds float, float* %x, i64 4 %4 = load float, float* %arrayidx.4, align 4 %add.4 = fadd fast float %4, %add.3 %arrayidx.5 = getelementptr inbounds float, float* %x, i64 5 %5 = load float, float* %arrayidx.5, align 4 %add.5 = fadd fast float %5, %add.4 %arrayidx.6 = getelementptr inbounds float, float* %x, i64 6 %6 = load float, float* %arrayidx.6, align 4 %add.6 = fadd fast float %6, %add.5 %arrayidx.7 = getelementptr inbounds float, float* %x, i64 7 %7 = load float, float* %arrayidx.7, align 4 %add.7 = fadd fast float %7, %add.6 %arrayidx.8 = getelementptr inbounds float, float* %x, i64 8 %8 = load float, float* %arrayidx.8, align 4 %add.8 = fadd fast float %8, %add.7 %arrayidx.9 = getelementptr inbounds float, float* %x, i64 9 %9 = load float, float* %arrayidx.9, align 4 %add.9 = fadd fast float %9, %add.8 %arrayidx.10 = getelementptr inbounds float, float* %x, i64 10 %10 = load float, float* %arrayidx.10, align 4 %add.10 = fadd fast float %10, %add.9 %arrayidx.11 = getelementptr inbounds float, float* %x, i64 11 %11 = load float, float* %arrayidx.11, align 4 %add.11 = fadd fast float %11, %add.10 %arrayidx.12 = getelementptr inbounds float, float* %x, i64 12 %12 = load float, float* %arrayidx.12, align 4 %add.12 = fadd fast float %12, %add.11 %arrayidx.13 = getelementptr inbounds float, float* %x, i64 13 %13 = load float, float* %arrayidx.13, align 4 %add.13 = fadd fast float %13, %add.12 %arrayidx.14 = getelementptr inbounds float, float* %x, i64 14 %14 = load float, float* %arrayidx.14, align 4 %add.14 = fadd fast float %14, %add.13 %arrayidx.15 = getelementptr inbounds float, float* %x, i64 15 %15 = load float, float* %arrayidx.15, align 4 %add.15 = fadd fast float %15, %add.14 %arrayidx.16 = getelementptr 
inbounds float, float* %x, i64 16 %16 = load float, float* %arrayidx.16, align 4 %add.16 = fadd fast float %16, %add.15 %arrayidx.17 = getelementptr inbounds float, float* %x, i64 17 %17 = load float, float* %arrayidx.17, align 4 %add.17 = fadd fast float %17, %add.16 %arrayidx.18 = getelementptr inbounds float, float* %x, i64 18 %18 = load float, float* %arrayidx.18, align 4 %add.18 = fadd fast float %18, %add.17 %arrayidx.19 = getelementptr inbounds float, float* %x, i64 19 %19 = load float, float* %arrayidx.19, align 4 %add.19 = fadd fast float %19, %add.18 %arrayidx.20 = getelementptr inbounds float, float* %x, i64 20 %20 = load float, float* %arrayidx.20, align 4 %add.20 = fadd fast float %20, %add.19 %arrayidx.21 = getelementptr inbounds float, float* %x, i64 21 %21 = load float, float* %arrayidx.21, align 4 %add.21 = fadd fast float %21, %add.20 %arrayidx.22 = getelementptr inbounds float, float* %x, i64 22 %22 = load float, float* %arrayidx.22, align 4 %add.22 = fadd fast float %22, %add.21 %arrayidx.23 = getelementptr inbounds float, float* %x, i64 23 %23 = load float, float* %arrayidx.23, align 4 %add.23 = fadd fast float %23, %add.22 %arrayidx.24 = getelementptr inbounds float, float* %x, i64 24 %24 = load float, float* %arrayidx.24, align 4 %add.24 = fadd fast float %24, %add.23 %arrayidx.25 = getelementptr inbounds float, float* %x, i64 25 %25 = load float, float* %arrayidx.25, align 4 %add.25 = fadd fast float %25, %add.24 %arrayidx.26 = getelementptr inbounds float, float* %x, i64 26 %26 = load float, float* %arrayidx.26, align 4 %add.26 = fadd fast float %26, %add.25 %arrayidx.27 = getelementptr inbounds float, float* %x, i64 27 %27 = load float, float* %arrayidx.27, align 4 %add.27 = fadd fast float %27, %add.26 %arrayidx.28 = getelementptr inbounds float, float* %x, i64 28 %28 = load float, float* %arrayidx.28, align 4 %add.28 = fadd fast float %28, %add.27 %arrayidx.29 = getelementptr inbounds float, float* %x, i64 29 %29 = load float, float* 
%arrayidx.29, align 4 %add.29 = fadd fast float %29, %add.28 %arrayidx.30 = getelementptr inbounds float, float* %x, i64 30 %30 = load float, float* %arrayidx.30, align 4 %add.30 = fadd fast float %30, %add.29 %arrayidx.31 = getelementptr inbounds float, float* %x, i64 31 %31 = load float, float* %arrayidx.31, align 4 %add.31 = fadd fast float %31, %add.30 %arrayidx.32 = getelementptr inbounds float, float* %x, i64 32 %32 = load float, float* %arrayidx.32, align 4 %add.32 = fadd fast float %32, %add.31 %arrayidx.33 = getelementptr inbounds float, float* %x, i64 33 %33 = load float, float* %arrayidx.33, align 4 %add.33 = fadd fast float %33, %add.32 %arrayidx.34 = getelementptr inbounds float, float* %x, i64 34 %34 = load float, float* %arrayidx.34, align 4 %add.34 = fadd fast float %34, %add.33 %arrayidx.35 = getelementptr inbounds float, float* %x, i64 35 %35 = load float, float* %arrayidx.35, align 4 %add.35 = fadd fast float %35, %add.34 %arrayidx.36 = getelementptr inbounds float, float* %x, i64 36 %36 = load float, float* %arrayidx.36, align 4 %add.36 = fadd fast float %36, %add.35 %arrayidx.37 = getelementptr inbounds float, float* %x, i64 37 %37 = load float, float* %arrayidx.37, align 4 %add.37 = fadd fast float %37, %add.36 %arrayidx.38 = getelementptr inbounds float, float* %x, i64 38 %38 = load float, float* %arrayidx.38, align 4 %add.38 = fadd fast float %38, %add.37 %arrayidx.39 = getelementptr inbounds float, float* %x, i64 39 %39 = load float, float* %arrayidx.39, align 4 %add.39 = fadd fast float %39, %add.38 %arrayidx.40 = getelementptr inbounds float, float* %x, i64 40 %40 = load float, float* %arrayidx.40, align 4 %add.40 = fadd fast float %40, %add.39 %arrayidx.41 = getelementptr inbounds float, float* %x, i64 41 %41 = load float, float* %arrayidx.41, align 4 %add.41 = fadd fast float %41, %add.40 %arrayidx.42 = getelementptr inbounds float, float* %x, i64 42 %42 = load float, float* %arrayidx.42, align 4 %add.42 = fadd fast float %42, %add.41 
%arrayidx.43 = getelementptr inbounds float, float* %x, i64 43 %43 = load float, float* %arrayidx.43, align 4 %add.43 = fadd fast float %43, %add.42 %arrayidx.44 = getelementptr inbounds float, float* %x, i64 44 %44 = load float, float* %arrayidx.44, align 4 %add.44 = fadd fast float %44, %add.43 %arrayidx.45 = getelementptr inbounds float, float* %x, i64 45 %45 = load float, float* %arrayidx.45, align 4 %add.45 = fadd fast float %45, %add.44 %arrayidx.46 = getelementptr inbounds float, float* %x, i64 46 %46 = load float, float* %arrayidx.46, align 4 %add.46 = fadd fast float %46, %add.45 %arrayidx.47 = getelementptr inbounds float, float* %x, i64 47 %47 = load float, float* %arrayidx.47, align 4 %add.47 = fadd fast float %47, %add.46 ret float %add.47 } define float @f1(float* nocapture readonly %x, i32 %a, i32 %b) { ; CHECK-LABEL: @f1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[REM:%.*]] = srem i32 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[REM]] to float ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 ; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 4 ; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 5 ; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 6 ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 8 ; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 9 ; CHECK-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 10 ; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 11 ; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, 
float* [[X]], i64 12 ; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 13 ; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 14 ; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15 ; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16 ; CHECK-NEXT: [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17 ; CHECK-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18 ; CHECK-NEXT: [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 19 ; CHECK-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 20 ; CHECK-NEXT: [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 21 ; CHECK-NEXT: [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 22 ; CHECK-NEXT: [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 23 ; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 24 ; CHECK-NEXT: [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 25 ; CHECK-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 26 ; CHECK-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 27 ; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 28 ; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 29 ; CHECK-NEXT: [[ARRAYIDX_30:%.*]] = getelementptr inbounds float, float* [[X]], i64 30 ; CHECK-NEXT: [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP1]], <32 x float> undef, <32 x i32> ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP1]], [[RDX_SHUF]] 
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <32 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> ; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]] ; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> ; CHECK-NEXT: [[BIN_RDX6:%.*]] = fadd fast <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]] ; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> ; CHECK-NEXT: [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0 ; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]] ; CHECK-NEXT: ret float [[OP_EXTRA]] ; ; THRESHOLD-LABEL: @f1( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[REM:%.*]] = srem i32 [[A:%.*]], [[B:%.*]] ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[REM]] to float ; THRESHOLD-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1 ; THRESHOLD-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 ; THRESHOLD-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 ; THRESHOLD-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 4 ; THRESHOLD-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 5 ; THRESHOLD-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 6 ; THRESHOLD-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; THRESHOLD-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 8 ; THRESHOLD-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 9 ; THRESHOLD-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* 
[[X]], i64 10 ; THRESHOLD-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 11 ; THRESHOLD-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 12 ; THRESHOLD-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 13 ; THRESHOLD-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 14 ; THRESHOLD-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15 ; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16 ; THRESHOLD-NEXT: [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17 ; THRESHOLD-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18 ; THRESHOLD-NEXT: [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 19 ; THRESHOLD-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 20 ; THRESHOLD-NEXT: [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 21 ; THRESHOLD-NEXT: [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 22 ; THRESHOLD-NEXT: [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 23 ; THRESHOLD-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 24 ; THRESHOLD-NEXT: [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 25 ; THRESHOLD-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 26 ; THRESHOLD-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 27 ; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 28 ; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 29 ; THRESHOLD-NEXT: [[ARRAYIDX_30:%.*]] = getelementptr inbounds float, float* [[X]], i64 30 ; THRESHOLD-NEXT: [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31 ; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>* ; 
THRESHOLD-NEXT: [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4 ; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP1]], <32 x float> undef, <32 x i32> ; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP1]], [[RDX_SHUF]] ; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> ; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <32 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> ; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]] ; THRESHOLD-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> ; THRESHOLD-NEXT: [[BIN_RDX6:%.*]] = fadd fast <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]] ; THRESHOLD-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> ; THRESHOLD-NEXT: [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]] ; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0 ; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]] ; THRESHOLD-NEXT: ret float [[OP_EXTRA]] ; entry: %rem = srem i32 %a, %b %conv = sitofp i32 %rem to float %0 = load float, float* %x, align 4 %add = fadd fast float %0, %conv %arrayidx.1 = getelementptr inbounds float, float* %x, i64 1 %1 = load float, float* %arrayidx.1, align 4 %add.1 = fadd fast float %1, %add %arrayidx.2 = getelementptr inbounds float, float* %x, i64 2 %2 = load float, float* %arrayidx.2, align 4 %add.2 = fadd fast float %2, %add.1 %arrayidx.3 = getelementptr inbounds float, float* %x, i64 3 %3 = load float, float* %arrayidx.3, align 4 %add.3 = fadd fast float %3, %add.2 %arrayidx.4 = getelementptr inbounds float, float* %x, i64 4 %4 = load float, float* %arrayidx.4, align 4 %add.4 = fadd fast float %4, %add.3 %arrayidx.5 = getelementptr inbounds float, 
float* %x, i64 5 %5 = load float, float* %arrayidx.5, align 4 %add.5 = fadd fast float %5, %add.4 %arrayidx.6 = getelementptr inbounds float, float* %x, i64 6 %6 = load float, float* %arrayidx.6, align 4 %add.6 = fadd fast float %6, %add.5 %arrayidx.7 = getelementptr inbounds float, float* %x, i64 7 %7 = load float, float* %arrayidx.7, align 4 %add.7 = fadd fast float %7, %add.6 %arrayidx.8 = getelementptr inbounds float, float* %x, i64 8 %8 = load float, float* %arrayidx.8, align 4 %add.8 = fadd fast float %8, %add.7 %arrayidx.9 = getelementptr inbounds float, float* %x, i64 9 %9 = load float, float* %arrayidx.9, align 4 %add.9 = fadd fast float %9, %add.8 %arrayidx.10 = getelementptr inbounds float, float* %x, i64 10 %10 = load float, float* %arrayidx.10, align 4 %add.10 = fadd fast float %10, %add.9 %arrayidx.11 = getelementptr inbounds float, float* %x, i64 11 %11 = load float, float* %arrayidx.11, align 4 %add.11 = fadd fast float %11, %add.10 %arrayidx.12 = getelementptr inbounds float, float* %x, i64 12 %12 = load float, float* %arrayidx.12, align 4 %add.12 = fadd fast float %12, %add.11 %arrayidx.13 = getelementptr inbounds float, float* %x, i64 13 %13 = load float, float* %arrayidx.13, align 4 %add.13 = fadd fast float %13, %add.12 %arrayidx.14 = getelementptr inbounds float, float* %x, i64 14 %14 = load float, float* %arrayidx.14, align 4 %add.14 = fadd fast float %14, %add.13 %arrayidx.15 = getelementptr inbounds float, float* %x, i64 15 %15 = load float, float* %arrayidx.15, align 4 %add.15 = fadd fast float %15, %add.14 %arrayidx.16 = getelementptr inbounds float, float* %x, i64 16 %16 = load float, float* %arrayidx.16, align 4 %add.16 = fadd fast float %16, %add.15 %arrayidx.17 = getelementptr inbounds float, float* %x, i64 17 %17 = load float, float* %arrayidx.17, align 4 %add.17 = fadd fast float %17, %add.16 %arrayidx.18 = getelementptr inbounds float, float* %x, i64 18 %18 = load float, float* %arrayidx.18, align 4 %add.18 = fadd fast float %18, 
%add.17 %arrayidx.19 = getelementptr inbounds float, float* %x, i64 19 %19 = load float, float* %arrayidx.19, align 4 %add.19 = fadd fast float %19, %add.18 %arrayidx.20 = getelementptr inbounds float, float* %x, i64 20 %20 = load float, float* %arrayidx.20, align 4 %add.20 = fadd fast float %20, %add.19 %arrayidx.21 = getelementptr inbounds float, float* %x, i64 21 %21 = load float, float* %arrayidx.21, align 4 %add.21 = fadd fast float %21, %add.20 %arrayidx.22 = getelementptr inbounds float, float* %x, i64 22 %22 = load float, float* %arrayidx.22, align 4 %add.22 = fadd fast float %22, %add.21 %arrayidx.23 = getelementptr inbounds float, float* %x, i64 23 %23 = load float, float* %arrayidx.23, align 4 %add.23 = fadd fast float %23, %add.22 %arrayidx.24 = getelementptr inbounds float, float* %x, i64 24 %24 = load float, float* %arrayidx.24, align 4 %add.24 = fadd fast float %24, %add.23 %arrayidx.25 = getelementptr inbounds float, float* %x, i64 25 %25 = load float, float* %arrayidx.25, align 4 %add.25 = fadd fast float %25, %add.24 %arrayidx.26 = getelementptr inbounds float, float* %x, i64 26 %26 = load float, float* %arrayidx.26, align 4 %add.26 = fadd fast float %26, %add.25 %arrayidx.27 = getelementptr inbounds float, float* %x, i64 27 %27 = load float, float* %arrayidx.27, align 4 %add.27 = fadd fast float %27, %add.26 %arrayidx.28 = getelementptr inbounds float, float* %x, i64 28 %28 = load float, float* %arrayidx.28, align 4 %add.28 = fadd fast float %28, %add.27 %arrayidx.29 = getelementptr inbounds float, float* %x, i64 29 %29 = load float, float* %arrayidx.29, align 4 %add.29 = fadd fast float %29, %add.28 %arrayidx.30 = getelementptr inbounds float, float* %x, i64 30 %30 = load float, float* %arrayidx.30, align 4 %add.30 = fadd fast float %30, %add.29 %arrayidx.31 = getelementptr inbounds float, float* %x, i64 31 %31 = load float, float* %arrayidx.31, align 4 %add.31 = fadd fast float %31, %add.30 ret float %add.31 } define float @loadadd31(float* 
nocapture readonly %x) { ; CHECK-LABEL: @loadadd31( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4 ; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5 ; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_2]] to <4 x float>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4 ; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 8 ; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 9 ; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 10 ; CHECK-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 11 ; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 12 ; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 13 ; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 14 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>* ; CHECK-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4 ; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 15 ; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 16 ; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 17 ; CHECK-NEXT: 
[[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 18 ; CHECK-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 19 ; CHECK-NEXT: [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 20 ; CHECK-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 21 ; CHECK-NEXT: [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 22 ; CHECK-NEXT: [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 23 ; CHECK-NEXT: [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 24 ; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 25 ; CHECK-NEXT: [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 26 ; CHECK-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 27 ; CHECK-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 28 ; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 29 ; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>* ; CHECK-NEXT: [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> undef, <16 x i32> ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x float> [[TMP7]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <16 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> ; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]] ; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> ; CHECK-NEXT: [[BIN_RDX6:%.*]] = fadd fast <16 x float> 
[[BIN_RDX4]], [[RDX_SHUF5]] ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0 ; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX8:%.*]] = fadd fast <8 x float> [[TMP5]], [[RDX_SHUF7]] ; CHECK-NEXT: [[RDX_SHUF9:%.*]] = shufflevector <8 x float> [[BIN_RDX8]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX10:%.*]] = fadd fast <8 x float> [[BIN_RDX8]], [[RDX_SHUF9]] ; CHECK-NEXT: [[RDX_SHUF11:%.*]] = shufflevector <8 x float> [[BIN_RDX10]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX12:%.*]] = fadd fast <8 x float> [[BIN_RDX10]], [[RDX_SHUF11]] ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[BIN_RDX12]], i32 0 ; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] ; CHECK-NEXT: [[RDX_SHUF13:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX14:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF13]] ; CHECK-NEXT: [[RDX_SHUF15:%.*]] = shufflevector <4 x float> [[BIN_RDX14]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[BIN_RDX16:%.*]] = fadd fast <4 x float> [[BIN_RDX14]], [[RDX_SHUF15]] ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX16]], i32 0 ; CHECK-NEXT: [[OP_RDX17:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]] ; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX17]], [[TMP1]] ; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]] ; CHECK-NEXT: ret float [[TMP12]] ; ; THRESHOLD-LABEL: @loadadd31( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1 ; THRESHOLD-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 ; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 ; THRESHOLD-NEXT: 
[[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4 ; THRESHOLD-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5 ; THRESHOLD-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6 ; THRESHOLD-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_2]] to <4 x float>* ; THRESHOLD-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; THRESHOLD-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 8 ; THRESHOLD-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 9 ; THRESHOLD-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 10 ; THRESHOLD-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 11 ; THRESHOLD-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 12 ; THRESHOLD-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 13 ; THRESHOLD-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 14 ; THRESHOLD-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>* ; THRESHOLD-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 15 ; THRESHOLD-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 16 ; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 17 ; THRESHOLD-NEXT: [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 18 ; THRESHOLD-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 19 ; THRESHOLD-NEXT: [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 20 ; THRESHOLD-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 21 ; THRESHOLD-NEXT: [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, 
float* [[X]], i64 22 ; THRESHOLD-NEXT: [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 23 ; THRESHOLD-NEXT: [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 24 ; THRESHOLD-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 25 ; THRESHOLD-NEXT: [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 26 ; THRESHOLD-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 27 ; THRESHOLD-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 28 ; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 29 ; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30 ; THRESHOLD-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>* ; THRESHOLD-NEXT: [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4 ; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> undef, <16 x i32> ; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x float> [[TMP7]], [[RDX_SHUF]] ; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> ; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <16 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> ; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]] ; THRESHOLD-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> ; THRESHOLD-NEXT: [[BIN_RDX6:%.*]] = fadd fast <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]] ; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0 ; THRESHOLD-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> undef, <8 x i32> ; THRESHOLD-NEXT: [[BIN_RDX8:%.*]] = fadd fast <8 x float> [[TMP5]], [[RDX_SHUF7]] ; THRESHOLD-NEXT: [[RDX_SHUF9:%.*]] = 
shufflevector <8 x float> [[BIN_RDX8]], <8 x float> undef, <8 x i32> ; THRESHOLD-NEXT: [[BIN_RDX10:%.*]] = fadd fast <8 x float> [[BIN_RDX8]], [[RDX_SHUF9]] ; THRESHOLD-NEXT: [[RDX_SHUF11:%.*]] = shufflevector <8 x float> [[BIN_RDX10]], <8 x float> undef, <8 x i32> ; THRESHOLD-NEXT: [[BIN_RDX12:%.*]] = fadd fast <8 x float> [[BIN_RDX10]], [[RDX_SHUF11]] ; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[BIN_RDX12]], i32 0 ; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] ; THRESHOLD-NEXT: [[RDX_SHUF13:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> ; THRESHOLD-NEXT: [[BIN_RDX14:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF13]] ; THRESHOLD-NEXT: [[RDX_SHUF15:%.*]] = shufflevector <4 x float> [[BIN_RDX14]], <4 x float> undef, <4 x i32> ; THRESHOLD-NEXT: [[BIN_RDX16:%.*]] = fadd fast <4 x float> [[BIN_RDX14]], [[RDX_SHUF15]] ; THRESHOLD-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX16]], i32 0 ; THRESHOLD-NEXT: [[OP_RDX17:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]] ; THRESHOLD-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX17]], [[TMP1]] ; THRESHOLD-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]] ; THRESHOLD-NEXT: ret float [[TMP12]] ; entry: %arrayidx = getelementptr inbounds float, float* %x, i64 1 %0 = load float, float* %arrayidx, align 4 %arrayidx.1 = getelementptr inbounds float, float* %x, i64 2 %1 = load float, float* %arrayidx.1, align 4 %add.1 = fadd fast float %1, %0 %arrayidx.2 = getelementptr inbounds float, float* %x, i64 3 %2 = load float, float* %arrayidx.2, align 4 %add.2 = fadd fast float %2, %add.1 %arrayidx.3 = getelementptr inbounds float, float* %x, i64 4 %3 = load float, float* %arrayidx.3, align 4 %add.3 = fadd fast float %3, %add.2 %arrayidx.4 = getelementptr inbounds float, float* %x, i64 5 %4 = load float, float* %arrayidx.4, align 4 %add.4 = fadd fast float %4, %add.3 %arrayidx.5 = getelementptr inbounds float, float* %x, i64 6 %5 = load float, float* 
%arrayidx.5, align 4 %add.5 = fadd fast float %5, %add.4 %arrayidx.6 = getelementptr inbounds float, float* %x, i64 7 %6 = load float, float* %arrayidx.6, align 4 %add.6 = fadd fast float %6, %add.5 %arrayidx.7 = getelementptr inbounds float, float* %x, i64 8 %7 = load float, float* %arrayidx.7, align 4 %add.7 = fadd fast float %7, %add.6 %arrayidx.8 = getelementptr inbounds float, float* %x, i64 9 %8 = load float, float* %arrayidx.8, align 4 %add.8 = fadd fast float %8, %add.7 %arrayidx.9 = getelementptr inbounds float, float* %x, i64 10 %9 = load float, float* %arrayidx.9, align 4 %add.9 = fadd fast float %9, %add.8 %arrayidx.10 = getelementptr inbounds float, float* %x, i64 11 %10 = load float, float* %arrayidx.10, align 4 %add.10 = fadd fast float %10, %add.9 %arrayidx.11 = getelementptr inbounds float, float* %x, i64 12 %11 = load float, float* %arrayidx.11, align 4 %add.11 = fadd fast float %11, %add.10 %arrayidx.12 = getelementptr inbounds float, float* %x, i64 13 %12 = load float, float* %arrayidx.12, align 4 %add.12 = fadd fast float %12, %add.11 %arrayidx.13 = getelementptr inbounds float, float* %x, i64 14 %13 = load float, float* %arrayidx.13, align 4 %add.13 = fadd fast float %13, %add.12 %arrayidx.14 = getelementptr inbounds float, float* %x, i64 15 %14 = load float, float* %arrayidx.14, align 4 %add.14 = fadd fast float %14, %add.13 %arrayidx.15 = getelementptr inbounds float, float* %x, i64 16 %15 = load float, float* %arrayidx.15, align 4 %add.15 = fadd fast float %15, %add.14 %arrayidx.16 = getelementptr inbounds float, float* %x, i64 17 %16 = load float, float* %arrayidx.16, align 4 %add.16 = fadd fast float %16, %add.15 %arrayidx.17 = getelementptr inbounds float, float* %x, i64 18 %17 = load float, float* %arrayidx.17, align 4 %add.17 = fadd fast float %17, %add.16 %arrayidx.18 = getelementptr inbounds float, float* %x, i64 19 %18 = load float, float* %arrayidx.18, align 4 %add.18 = fadd fast float %18, %add.17 %arrayidx.19 = getelementptr 
inbounds float, float* %x, i64 20
  %19 = load float, float* %arrayidx.19, align 4
  %add.19 = fadd fast float %19, %add.18
  %arrayidx.20 = getelementptr inbounds float, float* %x, i64 21
  %20 = load float, float* %arrayidx.20, align 4
  %add.20 = fadd fast float %20, %add.19
  %arrayidx.21 = getelementptr inbounds float, float* %x, i64 22
  %21 = load float, float* %arrayidx.21, align 4
  %add.21 = fadd fast float %21, %add.20
  %arrayidx.22 = getelementptr inbounds float, float* %x, i64 23
  %22 = load float, float* %arrayidx.22, align 4
  %add.22 = fadd fast float %22, %add.21
  %arrayidx.23 = getelementptr inbounds float, float* %x, i64 24
  %23 = load float, float* %arrayidx.23, align 4
  %add.23 = fadd fast float %23, %add.22
  %arrayidx.24 = getelementptr inbounds float, float* %x, i64 25
  %24 = load float, float* %arrayidx.24, align 4
  %add.24 = fadd fast float %24, %add.23
  %arrayidx.25 = getelementptr inbounds float, float* %x, i64 26
  %25 = load float, float* %arrayidx.25, align 4
  %add.25 = fadd fast float %25, %add.24
  %arrayidx.26 = getelementptr inbounds float, float* %x, i64 27
  %26 = load float, float* %arrayidx.26, align 4
  %add.26 = fadd fast float %26, %add.25
  %arrayidx.27 = getelementptr inbounds float, float* %x, i64 28
  %27 = load float, float* %arrayidx.27, align 4
  %add.27 = fadd fast float %27, %add.26
  %arrayidx.28 = getelementptr inbounds float, float* %x, i64 29
  %28 = load float, float* %arrayidx.28, align 4
  %add.28 = fadd fast float %28, %add.27
  %arrayidx.29 = getelementptr inbounds float, float* %x, i64 30
  %29 = load float, float* %arrayidx.29, align 4
  %add.29 = fadd fast float %29, %add.28
  ret float %add.29
}

; A fadd reduction over x[0..7] with two extra (non-load) reduction operands:
; %conv is added once inside the chain and once more at the end, plus the
; constant 3.0.  The vectorizer should emit one <8 x float> reduction and
; re-apply the extra arguments as scalar OP_EXTRA adds after the horizontal sum.
define float @extra_args(float* nocapture readonly %x, i32 %a, i32 %b) {
; CHECK-LABEL: @extra_args(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
; CHECK-NEXT: [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
; CHECK-NEXT: [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
; CHECK-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
; CHECK-NEXT: ret float [[OP_EXTRA5]]
;
; THRESHOLD-LABEL: @extra_args(
; THRESHOLD-NEXT: entry:
; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
; THRESHOLD-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
; THRESHOLD-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
; THRESHOLD-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
; THRESHOLD-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
; THRESHOLD-NEXT: [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
; THRESHOLD-NEXT: [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
; THRESHOLD-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
; THRESHOLD-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
; THRESHOLD-NEXT: ret float [[OP_EXTRA5]]
;
entry:
  %mul = mul nsw i32 %b, %a
  %conv = sitofp i32 %mul to float
  %0 = load float, float* %x, align 4
  %add = fadd fast float %conv, 3.000000e+00
  %add1 = fadd fast float %0, %add
  %arrayidx3 = getelementptr inbounds float, float* %x, i64 1
  %1 = load float, float* %arrayidx3, align 4
  %add4 = fadd fast float %1, %add1
  %add5 = fadd fast float %add4, %conv
  %arrayidx3.1 = getelementptr inbounds float, float* %x, i64 2
  %2 = load float, float* %arrayidx3.1, align 4
  %add4.1 = fadd fast float %2, %add5
  %arrayidx3.2 = getelementptr inbounds float, float* %x, i64 3
  %3 = load float, float* %arrayidx3.2, align 4
  %add4.2 = fadd fast float %3, %add4.1
  %arrayidx3.3 = getelementptr inbounds float, float* %x, i64 4
  %4 = load float, float* %arrayidx3.3, align 4
  %add4.3 = fadd fast float %4, %add4.2
  %arrayidx3.4 = getelementptr inbounds float, float* %x, i64 5
  %5 = load float, float* %arrayidx3.4, align 4
  %add4.4 = fadd fast float %5, %add4.3
  %arrayidx3.5 = getelementptr inbounds float, float* %x, i64 6
  %6 = load float, float* %arrayidx3.5, align 4
  %add4.5 = fadd fast float %6, %add4.4
  %arrayidx3.6 = getelementptr inbounds float, float* %x, i64 7
  %7 = load float, float* %arrayidx3.6, align 4
  %add4.6 = fadd fast float %7, %add4.5
  ret float %add4.6
}

; Same as @extra_args, but the constant extra argument 5.0 occurs twice in the
; reduction chain; both occurrences must be replayed as separate scalar adds
; after the vector reduction (OP_EXTRA5 and OP_EXTRA6).
define float @extra_args_same_several_times(float* nocapture readonly %x, i32 %a, i32 %b) {
; CHECK-LABEL: @extra_args_same_several_times(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
; CHECK-NEXT: [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
; CHECK-NEXT: [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
; CHECK-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], 5.000000e+00
; CHECK-NEXT: [[OP_EXTRA6:%.*]] = fadd fast float [[OP_EXTRA5]], 5.000000e+00
; CHECK-NEXT: [[OP_EXTRA7:%.*]] = fadd fast float [[OP_EXTRA6]], [[CONV]]
; CHECK-NEXT: ret float [[OP_EXTRA7]]
;
; THRESHOLD-LABEL: @extra_args_same_several_times(
; THRESHOLD-NEXT: entry:
; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
; THRESHOLD-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
; THRESHOLD-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
; THRESHOLD-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
; THRESHOLD-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
; THRESHOLD-NEXT: [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
; THRESHOLD-NEXT: [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
; THRESHOLD-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
; THRESHOLD-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], 5.000000e+00
; THRESHOLD-NEXT: [[OP_EXTRA6:%.*]] = fadd fast float [[OP_EXTRA5]], 5.000000e+00
; THRESHOLD-NEXT: [[OP_EXTRA7:%.*]] = fadd fast float [[OP_EXTRA6]], [[CONV]]
; THRESHOLD-NEXT: ret float [[OP_EXTRA7]]
;
entry:
  %mul = mul nsw i32 %b, %a
  %conv = sitofp i32 %mul to float
  %0 = load float, float* %x, align 4
  %add = fadd fast float %conv, 3.000000e+00
  %add1 = fadd fast float %0, %add
  %arrayidx3 = getelementptr inbounds float, float* %x, i64 1
  %1 = load float, float* %arrayidx3, align 4
  %add4 = fadd fast float %1, %add1
  %add41 = fadd fast float %add4, 5.000000e+00
  %add5 = fadd fast float %add41, %conv
  %arrayidx3.1 = getelementptr inbounds float, float* %x, i64 2
  %2 = load float, float* %arrayidx3.1, align 4
  %add4.1 = fadd fast float %2, %add5
  %add4.11 = fadd fast float %add4.1, 5.000000e+00
  %arrayidx3.2 = getelementptr inbounds float, float* %x, i64 3
  %3 = load float, float* %arrayidx3.2, align 4
  %add4.2 = fadd fast float %3, %add4.11
  %arrayidx3.3 = getelementptr inbounds float, float* %x, i64 4
  %4 = load float, float* %arrayidx3.3, align 4
  %add4.3 = fadd fast float %4, %add4.2
  %arrayidx3.4 = getelementptr inbounds float, float* %x, i64 5
  %5 = load float, float* %arrayidx3.4, align 4
  %add4.4 = fadd fast float %5, %add4.3
  %arrayidx3.5 = getelementptr inbounds float, float* %x, i64 6
  %6 = load float, float* %arrayidx3.5, align 4
  %add4.5 = fadd fast float %6, %add4.4
  %arrayidx3.6 = getelementptr inbounds float, float* %x, i64 7
  %7 = load float, float* %arrayidx3.6, align 4
  %add4.6 = fadd fast float %7, %add4.5
  ret float %add4.6
}

; Here one extra operand (%add, which depends on %c) must NOT be folded into
; the reduction's initial value because it is mixed with %conv inside the
; chain; the reduced value is recombined with both extras as scalar adds.
define float @extra_args_no_replace(float* nocapture readonly %x, i32 %a, i32 %b, i32 %c) {
; CHECK-LABEL: @extra_args_no_replace(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
; CHECK-NEXT: [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float
; CHECK-NEXT: [[ADDC:%.*]] = fadd fast float [[CONVC]], 3.000000e+00
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], [[ADDC]]
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
; CHECK-NEXT: [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
; CHECK-NEXT: [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
; CHECK-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
; CHECK-NEXT: ret float [[OP_EXTRA5]]
;
; THRESHOLD-LABEL: @extra_args_no_replace(
; THRESHOLD-NEXT: entry:
; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
; THRESHOLD-NEXT: [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float
; THRESHOLD-NEXT: [[ADDC:%.*]] = fadd fast float [[CONVC]], 3.000000e+00
; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], [[ADDC]]
; THRESHOLD-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
; THRESHOLD-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
; THRESHOLD-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
; THRESHOLD-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
; THRESHOLD-NEXT: [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
; THRESHOLD-NEXT: [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
; THRESHOLD-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
; THRESHOLD-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
; THRESHOLD-NEXT: ret float [[OP_EXTRA5]]
;
entry:
  %mul = mul nsw i32 %b, %a
  %conv = sitofp i32 %mul to float
  %0 = load float, float* %x, align 4
  %convc = sitofp i32 %c to float
  %addc = fadd fast float %convc, 3.000000e+00
  %add = fadd fast float %conv, %addc
  %add1 = fadd fast float %0, %add
  %arrayidx3 = getelementptr inbounds float, float* %x, i64 1
  %1 = load float, float* %arrayidx3, align 4
  %add4 = fadd fast float %1, %add1
  %arrayidx3.1 = getelementptr inbounds float, float* %x, i64 2
  %2 = load float, float* %arrayidx3.1, align 4
  %add4.1 = fadd fast float %2, %add4
  %arrayidx3.2 = getelementptr inbounds float, float* %x, i64 3
  %3 = load float, float* %arrayidx3.2, align 4
  %add4.2 = fadd fast float %3, %add4.1
  %arrayidx3.3 = getelementptr inbounds float, float* %x, i64 4
  %4 = load float, float* %arrayidx3.3, align 4
  %add4.3 = fadd fast float %4, %add4.2
  %add5 = fadd fast float %add4.3, %conv
  %arrayidx3.4 = getelementptr inbounds float, float* %x, i64 5
  %5 = load float, float* %arrayidx3.4, align 4
  %add4.4 = fadd fast float %5, %add5
  %arrayidx3.5 = getelementptr inbounds float, float* %x, i64 6
  %6 = load float, float* %arrayidx3.5, align 4
  %add4.5 = fadd fast float %6, %add4.4
  %arrayidx3.6 = getelementptr inbounds float, float* %x, i64 7
  %7 = load float, float* %arrayidx3.6, align 4
  %add4.6 = fadd fast float %7, %add4.5
  ret float %add4.6
}

; Integer reduction over four identical xor/icmp/sext chains; %arg is the
; initial value and the last xor (%x4) is also an extra operand of the final
; add, so it is extracted from the vectorized xor (element 3) and re-added.
define i32 @wobble(i32 %arg, i32 %bar) {
; CHECK-LABEL: @wobble(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[ARG:%.*]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[ARG]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[ARG]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[ARG]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[BAR:%.*]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[BAR]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[BAR]], i32 2
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[BAR]], i32 3
; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i32> [[TMP3]], [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <4 x i32> [[TMP8]], zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP11]], [[RDX_SHUF]]
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
; CHECK-NEXT: [[OP_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]]
; CHECK-NEXT: [[OP_EXTRA3:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP9]]
; CHECK-NEXT: ret i32 [[OP_EXTRA3]]
;
; THRESHOLD-LABEL: @wobble(
; THRESHOLD-NEXT: bb:
; THRESHOLD-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[ARG:%.*]], i32 0
; THRESHOLD-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[ARG]], i32 1
; THRESHOLD-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[ARG]], i32 2
; THRESHOLD-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[ARG]], i32 3
; THRESHOLD-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[BAR:%.*]], i32 0
; THRESHOLD-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[BAR]], i32 1
; THRESHOLD-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[BAR]], i32 2
; THRESHOLD-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[BAR]], i32 3
; THRESHOLD-NEXT: [[TMP8:%.*]] = xor <4 x i32> [[TMP3]], [[TMP7]]
; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3
; THRESHOLD-NEXT: [[TMP10:%.*]] = icmp eq <4 x i32> [[TMP8]], zeroinitializer
; THRESHOLD-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP11]], [[RDX_SHUF]]
; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
; THRESHOLD-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]]
; THRESHOLD-NEXT: [[OP_EXTRA3:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP9]]
; THRESHOLD-NEXT: ret i32 [[OP_EXTRA3]]
;
bb:
  %x1 = xor i32 %arg, %bar
  %i1 = icmp eq i32 %x1, 0
  %s1 = sext i1 %i1 to i32
  %x2 = xor i32 %arg, %bar
  %i2 = icmp eq i32 %x2, 0
  %s2 = sext i1 %i2 to i32
  %x3 = xor i32 %arg, %bar
  %i3 = icmp eq i32 %x3, 0
  %s3 = sext i1 %i3 to i32
  %x4 = xor i32 %arg, %bar
  %i4 = icmp eq i32 %x4, 0
  %s4 = sext i1 %i4 to i32
  %r1 = add nuw i32 %arg, %s1
  %r2 = add nsw i32 %r1, %s2
  %r3 = add nsw i32 %r2, %s3
  %r4 = add nsw i32 %r3, %s4
  %r5 = add nsw i32 %r4, %x4
  ret i32 %r5
}