; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -loop-vectorize-with-block-frequency -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @b = common global [2048 x i32] zeroinitializer, align 16 @c = common global [2048 x i32] zeroinitializer, align 16 @a = common global [2048 x i32] zeroinitializer, align 16 @G = common global [32 x [1024 x i32]] zeroinitializer, align 16 @ub = common global [1024 x i32] zeroinitializer, align 16 @uc = common global [1024 x i32] zeroinitializer, align 16 @d = common global [2048 x i32] zeroinitializer, align 16 @fa = common global [1024 x float] zeroinitializer, align 16 @fb = common global [1024 x float] zeroinitializer, align 16 @ic = common global [1024 x i32] zeroinitializer, align 16 @da = common global [1024 x float] zeroinitializer, align 16 @db = common global [1024 x float] zeroinitializer, align 16 @dc = common global [1024 x float] zeroinitializer, align 16 @dd = common global [1024 x float] zeroinitializer, align 16 @dj = common global [1024 x i32] zeroinitializer, align 16 ; We can optimize this test without a tail. define void @example1() optsize { ; CHECK-LABEL: @example1( ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 16 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[INDEX]] ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 16 ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[INDEX]] ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 16 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[TMP10:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[TMP9:%.*]] ; CHECK: br i1 undef, label [[TMP10]], label [[TMP9]], !llvm.loop !2 ; CHECK: ret void ; br label %1 ;