Diffstat (limited to 'llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll')
-rw-r--r-- | llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll | 631
1 files changed, 631 insertions, 0 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll index 9217c2ec9a7..fb9d29e34a1 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll @@ -1,5 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512 +; RUN: opt < %s -O3 -mcpu=knl -force-vector-width=2 -S | FileCheck %s -check-prefix=FVW2 + +; With a force-vector-width, it is sometimes more profitable to generate +; scalarized and predicated stores instead of masked scatter. target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc_linux" @@ -87,6 +91,73 @@ define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger ; AVX512: for.end: ; AVX512-NEXT: ret void ; +; FVW2-LABEL: @foo1( +; FVW2-NEXT: entry: +; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] +; FVW2: vector.body: +; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX6]] +; FVW2-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>* +; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 +; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 +; FVW2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>* +; FVW2-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4 +; FVW2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 +; FVW2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>* +; FVW2-NEXT: [[WIDE_LOAD11:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4 +; FVW2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 +; FVW2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>* +; FVW2-NEXT: [[WIDE_LOAD12:%.*]] = load <2 x i32>, <2 x i32>* [[TMP7]], align 4 +; FVW2-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer +; FVW2-NEXT: [[TMP9:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD10]], zeroinitializer +; FVW2-NEXT: [[TMP10:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD11]], zeroinitializer +; FVW2-NEXT: [[TMP11:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD12]], zeroinitializer +; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 [[INDEX6]] +; FVW2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>* +; FVW2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP13]], i32 4, <2 x i1> [[TMP8]], <2 x i32> undef) +; FVW2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 2 +; FVW2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>* +; FVW2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP15]], i32 4, <2 x i1> [[TMP9]], <2 x i32> undef) +; FVW2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 4 +; FVW2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>* +; FVW2-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP17]], i32 4, <2 x i1> [[TMP10]], <2 x i32> undef) +; FVW2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 6 +; FVW2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <2 x i32>* +; FVW2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP19]], i32 4, <2 x 
i1> [[TMP11]], <2 x i32> undef) +; FVW2-NEXT: [[TMP20:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD]] to <2 x i64> +; FVW2-NEXT: [[TMP21:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD13]] to <2 x i64> +; FVW2-NEXT: [[TMP22:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD14]] to <2 x i64> +; FVW2-NEXT: [[TMP23:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD15]] to <2 x i64> +; FVW2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <2 x i64> [[TMP20]] +; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP21]] +; FVW2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP22]] +; FVW2-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP23]] +; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP24]], i32 4, <2 x i1> [[TMP8]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP25]], i32 4, <2 x i1> [[TMP9]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER17:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP26]], i32 4, <2 x i1> [[TMP10]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER18:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP27]], i32 4, <2 x i1> [[TMP11]], <2 x float> undef) +; FVW2-NEXT: [[TMP28:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP29:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP30:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER17]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP31:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER18]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDEX6]] +; FVW2-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <2 x float>* +; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP28]], <2 x float>* [[TMP33]], i32 4, <2 x i1> [[TMP8]]) +; FVW2-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP32]], i64 2 +; FVW2-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <2 x float>* +; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP29]], <2 x float>* [[TMP35]], i32 4, <2 x i1> [[TMP9]]) +; FVW2-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP32]], i64 4 +; FVW2-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <2 x float>* +; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP30]], <2 x float>* [[TMP37]], i32 4, <2 x i1> [[TMP10]]) +; FVW2-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[TMP32]], i64 6 +; FVW2-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <2 x float>* +; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP31]], <2 x float>* [[TMP39]], i32 4, <2 x i1> [[TMP11]]) +; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8 +; FVW2-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; FVW2-NEXT: br i1 [[TMP40]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; FVW2: for.end: +; FVW2-NEXT: ret void +; entry: %in.addr = alloca float*, align 8 %out.addr = alloca float*, align 8 @@ -290,6 +361,118 @@ define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %tr ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, 
<16 x i1> [[TMP76]]) ; AVX512-NEXT: ret void ; +; FVW2-LABEL: @foo2( +; FVW2-NEXT: entry: +; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] +; FVW2: vector.body: +; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] +; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32> +; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64> +; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96> +; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4 +; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]] +; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]] +; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]] +; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]] +; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer +; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer +; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer +; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer +; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 +; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD]], i32 1 +; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1 +; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1 +; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) +; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], 
<float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; FVW2: pred.store.if: +; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]] +; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0 +; FVW2-NEXT: store float [[TMP18]], float* [[TMP17]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] +; FVW2: pred.store.continue: +; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] +; FVW2: pred.store.if17: +; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16 +; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP20]] +; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1 +; FVW2-NEXT: store float [[TMP22]], float* [[TMP21]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE18]] +; FVW2: pred.store.continue18: +; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] +; FVW2: pred.store.if19: +; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32 +; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP24]] +; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0 +; FVW2-NEXT: store float [[TMP26]], float* [[TMP25]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE20]] +; FVW2: pred.store.continue20: +; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] +; FVW2: pred.store.if21: +; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48 +; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP28]] +; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1 +; FVW2-NEXT: store float [[TMP30]], float* [[TMP29]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]] +; FVW2: pred.store.continue22: +; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 +; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] +; FVW2: pred.store.if23: +; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64 +; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP32]] +; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0 +; FVW2-NEXT: store float [[TMP34]], float* [[TMP33]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]] +; FVW2: pred.store.continue24: +; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 +; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] +; FVW2: pred.store.if25: +; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80 +; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP36]] +; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1 +; FVW2-NEXT: store float [[TMP38]], float* [[TMP37]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]] +; FVW2: pred.store.continue26: +; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; FVW2-NEXT: br i1 [[TMP39]], label 
[[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] +; FVW2: pred.store.if27: +; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96 +; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP40]] +; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 +; FVW2-NEXT: store float [[TMP42]], float* [[TMP41]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]] +; FVW2: pred.store.continue28: +; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]] +; FVW2: pred.store.if29: +; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112 +; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP44]] +; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 +; FVW2-NEXT: store float [[TMP46]], float* [[TMP45]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]] +; FVW2: pred.store.continue30: +; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8 +; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128> +; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !2 +; FVW2: for.end: +; FVW2-NEXT: ret void +; entry: %in.addr = alloca %struct.In*, align 8 %out.addr = alloca float*, align 8 @@ -494,6 +677,118 @@ define void @foo3(%struct.In* noalias %in, %struct.Out* noalias %out, i32* noali ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) ; AVX512-NEXT: ret void ; +; FVW2-LABEL: @foo3( +; FVW2-NEXT: entry: +; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] +; FVW2: vector.body: +; FVW2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE29:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE29]] ] +; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32> +; FVW2-NEXT: [[STEP_ADD6:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64> +; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96> +; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 4 +; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]] +; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]] +; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD6]] +; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]] +; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer +; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER9]], 
zeroinitializer +; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer +; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer +; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 +; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD]], i32 1 +; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD6]], i32 1 +; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1 +; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) +; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER12]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; FVW2: pred.store.if: +; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], i64 [[OFFSET_IDX]], i32 1 +; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0 +; FVW2-NEXT: store float [[TMP18]], float* [[TMP17]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] +; FVW2: pred.store.continue: +; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]] +; FVW2: pred.store.if16: +; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16 +; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP20]], i32 1 +; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1 +; FVW2-NEXT: store float [[TMP22]], float* [[TMP21]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE17]] +; FVW2: pred.store.continue17: +; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]] +; FVW2: pred.store.if18: +; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32 +; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP24]], i32 1 +; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0 +; FVW2-NEXT: store float [[TMP26]], float* [[TMP25]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE19]] +; FVW2: pred.store.continue19: +; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; 
FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]] +; FVW2: pred.store.if20: +; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48 +; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP28]], i32 1 +; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1 +; FVW2-NEXT: store float [[TMP30]], float* [[TMP29]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE21]] +; FVW2: pred.store.continue21: +; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 +; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]] +; FVW2: pred.store.if22: +; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64 +; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP32]], i32 1 +; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0 +; FVW2-NEXT: store float [[TMP34]], float* [[TMP33]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE23]] +; FVW2: pred.store.continue23: +; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 +; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]] +; FVW2: pred.store.if24: +; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80 +; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP36]], i32 1 +; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1 +; FVW2-NEXT: store float [[TMP38]], float* [[TMP37]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE25]] +; FVW2: pred.store.continue25: +; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]] +; FVW2: pred.store.if26: +; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96 +; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP40]], i32 1 +; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 +; FVW2-NEXT: store float [[TMP42]], float* [[TMP41]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE27]] +; FVW2: pred.store.continue27: +; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29]] +; FVW2: pred.store.if28: +; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112 +; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP44]], i32 1 +; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 +; FVW2-NEXT: store float [[TMP46]], float* [[TMP45]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE29]] +; FVW2: pred.store.continue29: +; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128> +; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !3 +; FVW2: for.end: +; FVW2-NEXT: ret void +; entry: %in.addr = alloca %struct.In*, align 8 %out.addr = alloca %struct.Out*, align 8 @@ -684,6 +979,118 @@ define void @foo2_addrspace(%struct.In addrspace(1)* noalias %in, float addrspac ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP78]], <16 x float addrspace(1)*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) ; AVX512-NEXT: ret void ; +; FVW2-LABEL: @foo2_addrspace( +; 
FVW2-NEXT: entry: +; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] +; FVW2: vector.body: +; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] +; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32> +; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64> +; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96> +; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4 +; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]] +; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]] +; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]] +; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]] +; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer +; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer +; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer +; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer +; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 +; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD]], i32 1 +; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1 +; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1 +; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) +; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP14:%.*]] = 
fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; FVW2: pred.store.if: +; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]] +; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0 +; FVW2-NEXT: store float [[TMP18]], float addrspace(1)* [[TMP17]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] +; FVW2: pred.store.continue: +; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] +; FVW2: pred.store.if17: +; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16 +; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP20]] +; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1 +; FVW2-NEXT: store float [[TMP22]], float addrspace(1)* [[TMP21]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE18]] +; FVW2: pred.store.continue18: +; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] +; FVW2: pred.store.if19: +; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32 +; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP24]] +; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0 +; FVW2-NEXT: store float [[TMP26]], float addrspace(1)* [[TMP25]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE20]] +; FVW2: pred.store.continue20: +; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] +; FVW2: pred.store.if21: +; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48 +; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP28]] +; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1 +; FVW2-NEXT: store float [[TMP30]], float addrspace(1)* [[TMP29]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]] +; FVW2: pred.store.continue22: +; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 +; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] +; FVW2: pred.store.if23: +; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64 +; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP32]] +; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0 +; FVW2-NEXT: store float [[TMP34]], float addrspace(1)* [[TMP33]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]] +; FVW2: pred.store.continue24: +; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 +; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] +; FVW2: pred.store.if25: +; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80 +; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP36]] +; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1 +; FVW2-NEXT: store float [[TMP38]], float addrspace(1)* 
[[TMP37]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]] +; FVW2: pred.store.continue26: +; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] +; FVW2: pred.store.if27: +; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96 +; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP40]] +; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 +; FVW2-NEXT: store float [[TMP42]], float addrspace(1)* [[TMP41]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]] +; FVW2: pred.store.continue28: +; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]] +; FVW2: pred.store.if29: +; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112 +; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP44]] +; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 +; FVW2-NEXT: store float [[TMP46]], float addrspace(1)* [[TMP45]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]] +; FVW2: pred.store.continue30: +; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8 +; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128> +; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 +; FVW2: for.end: +; FVW2-NEXT: ret void +; entry: %in.addr = alloca %struct.In addrspace(1)*, align 8 %out.addr = alloca float addrspace(1)*, align 8 @@ -874,6 +1281,118 @@ define void @foo2_addrspace2(%struct.In addrspace(1)* noalias %in, float addrspa ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) ; AVX512-NEXT: ret void ; +; FVW2-LABEL: @foo2_addrspace2( +; FVW2-NEXT: entry: +; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] +; FVW2: vector.body: +; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] +; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32> +; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64> +; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96> +; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4 +; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]] +; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]] +; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]] +; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]] +; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: 
[[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer +; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer +; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer +; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer +; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 +; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD]], i32 1 +; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1 +; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1 +; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) +; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; FVW2: pred.store.if: +; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]] +; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0 +; FVW2-NEXT: store float [[TMP18]], float* [[TMP17]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] +; FVW2: pred.store.continue: +; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] +; FVW2: pred.store.if17: +; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16 +; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP20]] +; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1 +; FVW2-NEXT: store float [[TMP22]], float* [[TMP21]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE18]] +; FVW2: pred.store.continue18: +; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] +; FVW2: pred.store.if19: +; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32 +; FVW2-NEXT: 
[[TMP25:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP24]] +; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0 +; FVW2-NEXT: store float [[TMP26]], float* [[TMP25]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE20]] +; FVW2: pred.store.continue20: +; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] +; FVW2: pred.store.if21: +; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48 +; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP28]] +; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1 +; FVW2-NEXT: store float [[TMP30]], float* [[TMP29]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]] +; FVW2: pred.store.continue22: +; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 +; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] +; FVW2: pred.store.if23: +; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64 +; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP32]] +; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0 +; FVW2-NEXT: store float [[TMP34]], float* [[TMP33]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]] +; FVW2: pred.store.continue24: +; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 +; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] +; FVW2: pred.store.if25: +; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80 +; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP36]] +; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1 +; FVW2-NEXT: store float [[TMP38]], float* [[TMP37]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]] +; FVW2: pred.store.continue26: +; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] +; FVW2: pred.store.if27: +; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96 +; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP40]] +; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 +; FVW2-NEXT: store float [[TMP42]], float* [[TMP41]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]] +; FVW2: pred.store.continue28: +; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]] +; FVW2: pred.store.if29: +; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112 +; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP44]] +; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 +; FVW2-NEXT: store float [[TMP46]], float* [[TMP45]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]] +; FVW2: pred.store.continue30: +; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8 +; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128> +; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 +; FVW2: for.end: +; FVW2-NEXT: ret void +; entry: %in.addr = alloca %struct.In addrspace(1)*, align 8 %out.addr = alloca float addrspace(0)*, align 8 @@ -1064,6 +1583,118 @@ define void 
@foo2_addrspace3(%struct.In addrspace(0)* noalias %in, float addrspa ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP78]], <16 x float addrspace(1)*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) ; AVX512-NEXT: ret void ; +; FVW2-LABEL: @foo2_addrspace3( +; FVW2-NEXT: entry: +; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] +; FVW2: vector.body: +; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] +; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32> +; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64> +; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96> +; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4 +; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]] +; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]] +; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]] +; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]] +; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer +; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer +; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer +; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer +; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 +; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD]], i32 1 +; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1 +; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1 +; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) +; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 
5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; FVW2: pred.store.if: +; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]] +; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0 +; FVW2-NEXT: store float [[TMP18]], float addrspace(1)* [[TMP17]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] +; FVW2: pred.store.continue: +; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] +; FVW2: pred.store.if17: +; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16 +; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP20]] +; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1 +; FVW2-NEXT: store float [[TMP22]], float addrspace(1)* [[TMP21]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE18]] +; FVW2: pred.store.continue18: +; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] +; FVW2: pred.store.if19: +; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32 +; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP24]] +; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0 +; FVW2-NEXT: store float [[TMP26]], float addrspace(1)* [[TMP25]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE20]] +; FVW2: pred.store.continue20: +; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] +; FVW2: pred.store.if21: +; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48 +; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP28]] +; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1 +; FVW2-NEXT: store float [[TMP30]], float addrspace(1)* [[TMP29]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]] +; FVW2: pred.store.continue22: +; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 +; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] +; FVW2: pred.store.if23: +; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64 +; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP32]] +; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0 +; FVW2-NEXT: store float [[TMP34]], float addrspace(1)* [[TMP33]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]] +; FVW2: pred.store.continue24: +; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 +; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] +; FVW2: pred.store.if25: +; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80 +; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float 
addrspace(1)* [[OUT]], i64 [[TMP36]] +; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1 +; FVW2-NEXT: store float [[TMP38]], float addrspace(1)* [[TMP37]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]] +; FVW2: pred.store.continue26: +; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] +; FVW2: pred.store.if27: +; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96 +; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP40]] +; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 +; FVW2-NEXT: store float [[TMP42]], float addrspace(1)* [[TMP41]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]] +; FVW2: pred.store.continue28: +; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]] +; FVW2: pred.store.if29: +; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112 +; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP44]] +; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 +; FVW2-NEXT: store float [[TMP46]], float addrspace(1)* [[TMP45]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]] +; FVW2: pred.store.continue30: +; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8 +; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128> +; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 +; FVW2: for.end: +; FVW2-NEXT: ret void +; entry: %in.addr = alloca %struct.In addrspace(0)*, align 8 %out.addr = alloca float addrspace(1)*, align 8 |
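The new comment added at the top of the test motivates the FVW2 run line: with a forced vector width of 2, the cost model can prefer scalarized, predicated stores over a masked scatter. As an illustrative aside (not part of the test file itself), here is a minimal LLVM IR sketch of the two store lowerings being contrasted; the function names are hypothetical, and the typed-pointer syntax is chosen to match the checks above.

; Masked-scatter form: one intrinsic stores both lanes; masked-off lanes are skipped.
define void @scatter_form(<2 x float> %vals, <2 x float*> %ptrs, <2 x i1> %mask) {
  call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %vals, <2 x float*> %ptrs, i32 4, <2 x i1> %mask)
  ret void
}

; Scalarized, predicated form: each lane gets its own branch and scalar store,
; mirroring the pred.store.if / pred.store.continue blocks in the FVW2 checks.
define void @predicated_form(<2 x float> %vals, <2 x float*> %ptrs, <2 x i1> %mask) {
entry:
  %m0 = extractelement <2 x i1> %mask, i32 0
  br i1 %m0, label %store.0, label %cont.0
store.0:
  %v0 = extractelement <2 x float> %vals, i32 0
  %p0 = extractelement <2 x float*> %ptrs, i32 0
  store float %v0, float* %p0, align 4
  br label %cont.0
cont.0:
  %m1 = extractelement <2 x i1> %mask, i32 1
  br i1 %m1, label %store.1, label %cont.1
store.1:
  %v1 = extractelement <2 x float> %vals, i32 1
  %p1 = extractelement <2 x float*> %ptrs, i32 1
  store float %v1, float* %p1, align 4
  br label %cont.1
cont.1:
  ret void
}

declare void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float>, <2 x float*>, i32, <2 x i1>)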