author    Sam Parker <sam.parker@arm.com>  2019-11-18 17:07:56 +0000
committer Sam Parker <sam.parker@arm.com>  2019-11-19 08:22:18 +0000
commit    8978c12b39f90194bb35860729ddca5e819f3b92 (patch)
tree      dd60953f653866f6508e4a083c4758d0de834b51 /llvm/test/CodeGen/Thumb2/LowOverheadLoops
parent    d593292f0465c9db1f2c3cdf719009bfdf942a5c (diff)
[ARM][MVE] Tail predication conversion
This patch modifies ARMLowOverheadLoops to convert a predicated vector
low-overhead loop into a tail-predicated one. This is currently a very
basic conversion, with the following restrictions:

- Operates only on single-block loops.
- The loop can only contain a single vctp instruction.
- No other instructions can write to the VPR.
- We only allow a subset of the MVE instructions in the loop.

TODO: Pass the number of elements, not the number of iterations, to
dlstp/wlstp.

Differential Revision: https://reviews.llvm.org/D69945
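For illustration, a minimal before/after sketch of the conversion, condensed
from the fast-fp-loops.ll diff below (registers and the label are taken from
that test):

    @ Before: explicit predication via vctp/vpst inside a dls/le loop
        dls     lr, lr
    .LBB0_5:
        vctp.32 r3              @ build the tail predicate from the element count
        vpstt                   @ predicate the next two instructions
        vldrwt.u32 q0, [r1]
        vldrwt.u32 q1, [r2]
        vmul.f32 q0, q1, q0
        vpst
        vstrwt.32 q0, [r0]
        le      lr, .LBB0_5

    @ After: the dlstp/letp loop makes the tail predication implicit,
    @ so the loads and stores no longer need vctp/vpst
        dlstp.32 lr, lr
    .LBB0_5:
        vldrw.u32 q0, [r1]
        vldrw.u32 q1, [r2]
        vmul.f32 q0, q1, q0
        vstrw.32 q0, [r0]
        letp    lr, .LBB0_5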
Diffstat (limited to 'llvm/test/CodeGen/Thumb2/LowOverheadLoops')
-rw-r--r--  llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll        |  13
-rw-r--r--  llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll  |  21
-rw-r--r--  llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll | 181
3 files changed, 163 insertions, 52 deletions
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
index 1b4b040573a..02d05ef9c0f 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
@@ -40,21 +40,18 @@ define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocaptur
; CHECK-NEXT: bic r6, r6, #3
; CHECK-NEXT: subs r6, #4
; CHECK-NEXT: add.w lr, r12, r6, lsr #2
-; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: dlstp.32 lr, lr
; CHECK-NEXT: .LBB0_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vctp.32 r3
-; CHECK-NEXT: vpstt
-; CHECK-NEXT: vldrwt.u32 q0, [r1]
-; CHECK-NEXT: vldrwt.u32 q1, [r2]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vmul.f32 q0, q1, q0
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrwt.32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r0]
; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: adds r2, #16
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: subs r3, #4
-; CHECK-NEXT: le lr, .LBB0_5
+; CHECK-NEXT: letp lr, .LBB0_5
; CHECK-NEXT: b .LBB0_11
; CHECK-NEXT: .LBB0_6: @ %for.body.preheader.new
; CHECK-NEXT: subs r3, r3, r7
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
index f5db56c2fa3..6343f68dd9a 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
@@ -355,18 +355,16 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
-; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: dlstp.32 lr, lr
; CHECK-NEXT: .LBB4_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vctp.32 r2
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrwt.u32 q2, [r1]
+; CHECK-NEXT: vldrw.u32 q2, [r1]
; CHECK-NEXT: mov r3, r2
; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vmla.u32 q0, q2, r0
-; CHECK-NEXT: le lr, .LBB4_1
+; CHECK-NEXT: letp lr, .LBB4_1
; CHECK-NEXT: @ %bb.2: @ %middle.block
; CHECK-NEXT: vctp.32 r3
; CHECK-NEXT: vpsel q0, q0, q1
@@ -1132,22 +1130,19 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(i32* nocapture readonly
; CHECK-NEXT: bic r4, r4, #3
; CHECK-NEXT: subs r4, #4
; CHECK-NEXT: add.w lr, lr, r4, lsr #2
-; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: dlstp.32 lr, lr
; CHECK-NEXT: .LBB9_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vctp.32 r12
-; CHECK-NEXT: vpstt
-; CHECK-NEXT: vldrwt.u32 q0, [r0]
-; CHECK-NEXT: vldrwt.u32 q1, [r1]
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: vmul.i32 q0, q1, q0
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: vadd.i32 q0, q0, r2
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrwt.32 q0, [r3]
+; CHECK-NEXT: vstrw.32 q0, [r3]
; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: adds r3, #16
; CHECK-NEXT: sub.w r12, r12, #4
-; CHECK-NEXT: le lr, .LBB9_5
+; CHECK-NEXT: letp lr, .LBB9_5
; CHECK-NEXT: b .LBB9_11
; CHECK-NEXT: .LBB9_6: @ %for.body.preheader.new
; CHECK-NEXT: sub.w r7, r12, r5
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
index 5659c9f97b7..02bf12ce620 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
@@ -15,21 +15,19 @@ define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* no
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
-; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: dlstp.32 lr, lr
; CHECK-NEXT: .LBB0_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmov q1, q0
-; CHECK-NEXT: vctp.32 r2
-; CHECK-NEXT: vpstt
-; CHECK-NEXT: vldrwt.u32 q0, [r0]
-; CHECK-NEXT: vldrwt.u32 q2, [r1]
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q2, [r1]
; CHECK-NEXT: mov r3, r2
; CHECK-NEXT: vmul.i32 q0, q2, q0
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.i32 q0, q0, q1
-; CHECK-NEXT: le lr, .LBB0_1
+; CHECK-NEXT: letp lr, .LBB0_1
; CHECK-NEXT: @ %bb.2: @ %middle.block
; CHECK-NEXT: vctp.32 r3
; CHECK-NEXT: vpsel q0, q0, q1
@@ -90,18 +88,16 @@ define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: subs r1, #4
; CHECK-NEXT: add.w lr, r3, r1, lsr #2
-; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: dlstp.32 lr, lr
; CHECK-NEXT: .LBB1_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: mov r1, r2
-; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: vmov q1, q0
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrwt.u32 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.i32 q0, q0, q1
-; CHECK-NEXT: le lr, .LBB1_1
+; CHECK-NEXT: letp lr, .LBB1_1
; CHECK-NEXT: @ %bb.2: @ %middle.block
; CHECK-NEXT: vctp.32 r1
; CHECK-NEXT: vpsel q0, q0, q1
@@ -158,18 +154,16 @@ define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: subs r1, #4
; CHECK-NEXT: add.w lr, r3, r1, lsr #2
-; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: dlstp.32 lr, lr
; CHECK-NEXT: .LBB2_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: mov r1, r2
-; CHECK-NEXT: vctp.32 r2
; CHECK-NEXT: vmov q1, q0
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrwt.u32 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.i32 q0, q0, q1
-; CHECK-NEXT: le lr, .LBB2_1
+; CHECK-NEXT: letp lr, .LBB2_1
; CHECK-NEXT: @ %bb.2: @ %middle.block
; CHECK-NEXT: vctp.32 r1
; CHECK-NEXT: vpsel q0, q0, q1
@@ -224,19 +218,16 @@ define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias
; CHECK-NEXT: bic r12, r12, #3
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: add.w lr, lr, r12, lsr #2
-; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: dlstp.32 lr, lr
; CHECK-NEXT: .LBB3_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vctp.32 r3
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrwt.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vmul.i32 q0, q0, r2
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrwt.32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r0]
; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: subs r3, #4
-; CHECK-NEXT: le lr, .LBB3_1
+; CHECK-NEXT: letp lr, .LBB3_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
@@ -286,19 +277,16 @@ define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias
; CHECK-NEXT: bic r12, r12, #3
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: add.w lr, lr, r12, lsr #2
-; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: dlstp.32 lr, lr
; CHECK-NEXT: .LBB4_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vctp.32 r3
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrwt.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vadd.i32 q0, q0, r2
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrwt.32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r0]
; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: subs r3, #4
-; CHECK-NEXT: le lr, .LBB4_1
+; CHECK-NEXT: letp lr, .LBB4_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
@@ -336,7 +324,138 @@ for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
+define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i8(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i8* noalias nocapture readonly %c, i32 %N) {
+; CHECK-LABEL: vector_mul_vector_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: cmp r3, #0
+; CHECK-NEXT: it eq
+; CHECK-NEXT: popeq {r4, pc}
+; CHECK-NEXT: add.w r12, r3, #15
+; CHECK-NEXT: mov.w lr, #1
+; CHECK-NEXT: bic r12, r12, #15
+; CHECK-NEXT: sub.w r12, r12, #16
+; CHECK-NEXT: add.w lr, lr, r12, lsr #4
+; CHECK-NEXT: mov.w r12, #0
+; CHECK-NEXT: dlstp.8 lr, lr
+; CHECK-NEXT: .LBB5_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add.w r4, r1, r12
+; CHECK-NEXT: vldrb.u8 q0, [r4]
+; CHECK-NEXT: add.w r4, r2, r12
+; CHECK-NEXT: vldrb.u8 q1, [r4]
+; CHECK-NEXT: add.w r4, r0, r12
+; CHECK-NEXT: add.w r12, r12, #16
+; CHECK-NEXT: subs r3, #16
+; CHECK-NEXT: vmul.i8 q0, q1, q0
+; CHECK-NEXT: vstrb.8 q0, [r4]
+; CHECK-NEXT: letp lr, .LBB5_1
+; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
+; CHECK-NEXT: pop {r4, pc}
+entry:
+ %cmp10 = icmp eq i32 %N, 0
+ br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
+
+vector.ph: ; preds = %entry
+ %n.rnd.up = add i32 %N, 15
+ %n.vec = and i32 %n.rnd.up, -16
+ %trip.count.minus.1 = add i32 %N, -1
+ %broadcast.splatinsert12 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
+ %broadcast.splat13 = shufflevector <16 x i32> %broadcast.splatinsert12, <16 x i32> undef, <16 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
+ %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
+ %induction = add <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %0 = getelementptr inbounds i8, i8* %b, i32 %index
+ %1 = icmp ule <16 x i32> %induction, %broadcast.splat13
+ %2 = bitcast i8* %0 to <16 x i8>*
+ %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %2, i32 1, <16 x i1> %1, <16 x i8> undef)
+ %3 = getelementptr inbounds i8, i8* %c, i32 %index
+ %4 = bitcast i8* %3 to <16 x i8>*
+ %wide.masked.load14 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %1, <16 x i8> undef)
+ %5 = mul <16 x i8> %wide.masked.load14, %wide.masked.load
+ %6 = getelementptr inbounds i8, i8* %a, i32 %index
+ %7 = bitcast i8* %6 to <16 x i8>*
+ call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %5, <16 x i8>* %7, i32 1, <16 x i1> %1)
+ %index.next = add i32 %index, 16
+ %8 = icmp eq i32 %index.next, %n.vec
+ br i1 %8, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body, %entry
+ ret void
+}
+
+; Function Attrs: nofree norecurse nounwind
+define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i16(i16* noalias nocapture %a, i16* noalias nocapture readonly %b, i16* noalias nocapture readonly %c, i32 %N) local_unnamed_addr #0 {
+; CHECK-LABEL: vector_mul_vector_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: cmp r3, #0
+; CHECK-NEXT: it eq
+; CHECK-NEXT: popeq {r7, pc}
+; CHECK-NEXT: add.w r12, r3, #7
+; CHECK-NEXT: mov.w lr, #1
+; CHECK-NEXT: bic r12, r12, #7
+; CHECK-NEXT: sub.w r12, r12, #8
+; CHECK-NEXT: add.w lr, lr, r12, lsr #3
+; CHECK-NEXT: dlstp.16 lr, lr
+; CHECK-NEXT: .LBB6_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrh.u16 q0, [r1]
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vmul.i16 q0, q1, q0
+; CHECK-NEXT: vstrh.16 q0, [r0]
+; CHECK-NEXT: adds r1, #16
+; CHECK-NEXT: adds r2, #16
+; CHECK-NEXT: adds r0, #16
+; CHECK-NEXT: subs r3, #8
+; CHECK-NEXT: letp lr, .LBB6_1
+; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %cmp10 = icmp eq i32 %N, 0
+ br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
+
+vector.ph: ; preds = %entry
+ %n.rnd.up = add i32 %N, 7
+ %n.vec = and i32 %n.rnd.up, -8
+ %trip.count.minus.1 = add i32 %N, -1
+ %broadcast.splatinsert12 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
+ %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> undef, <8 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
+ %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
+ %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %0 = getelementptr inbounds i16, i16* %b, i32 %index
+ %1 = icmp ule <8 x i32> %induction, %broadcast.splat13
+ %2 = bitcast i16* %0 to <8 x i16>*
+ %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %2, i32 2, <8 x i1> %1, <8 x i16> undef)
+ %3 = getelementptr inbounds i16, i16* %c, i32 %index
+ %4 = bitcast i16* %3 to <8 x i16>*
+ %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %1, <8 x i16> undef)
+ %5 = mul <8 x i16> %wide.masked.load14, %wide.masked.load
+ %6 = getelementptr inbounds i16, i16* %a, i32 %index
+ %7 = bitcast i16* %6 to <8 x i16>*
+ call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %5, <8 x i16>* %7, i32 2, <8 x i1> %1)
+ %index.next = add i32 %index, 8
+ %8 = icmp eq i32 %index.next, %n.vec
+ br i1 %8, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body, %entry
+ ret void
+}
+
+declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
+declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
-declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #4
+declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
+declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)