summary refs log tree commit diff stats
path: root/llvm/test/CodeGen/Thumb2/LowOverheadLoops
diff options
context:
space:
mode:
author Sam Parker <sam.parker@arm.com> 2019-09-23 09:48:25 +0000
committer Sam Parker <sam.parker@arm.com> 2019-09-23 09:48:25 +0000
commit 9feb429a337ff49fe119a64bff3724fb820c4501 (patch)
tree 2ab99115a6b989822c1243c41a07497fe1632313 /llvm/test/CodeGen/Thumb2/LowOverheadLoops
parent 14f6465c157b36c50ffe431463a9c94efda42b99 (diff)
download bcm5719-llvm-9feb429a337ff49fe119a64bff3724fb820c4501.tar.gz
bcm5719-llvm-9feb429a337ff49fe119a64bff3724fb820c4501.zip
[ARM][MVE] Remove old tail predicates
Remove any predicate that we replace with a vctp intrinsic, and try to remove their operands too. Also look into the exit block to see if there's any duplicates of the predicates that we've replaced and clone the vctp to be used there instead. Differential Revision: https://reviews.llvm.org/D67709 llvm-svn: 372567
Diffstat (limited to 'llvm/test/CodeGen/Thumb2/LowOverheadLoops')
-rw-r--r-- llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll 292
-rw-r--r-- llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll 242
-rw-r--r-- llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll 75
3 files changed, 609 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
new file mode 100644
index 00000000000..d701e0f1b57
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
@@ -0,0 +1,292 @@
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -enable-arm-maskedldst=true -disable-mve-tail-predication=false --verify-machineinstrs %s -o - | FileCheck %s
+
+; CHECK-LABEL: vpsel_mul_reduce_add
+; CHECK: dls lr, lr
+; CHECK: [[LOOP:.LBB[0-9_]+]]:
+; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], #4
+; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vstr p0, [sp
+; CHECK: vpstt
+; CHECK-NEXT: vldrwt.u32
+; CHECK-NEXT: vldrwt.u32
+; CHECK: vcmp.i32
+; CHECK: vpsel
+; CHECK: vldr p0, [sp
+; CHECK: vpst
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK: le lr, [[LOOP]]
+; CHECK: vctp.32 [[ELEMS]]
+; CHECK-NEXT: vpsel
+; CHECK-NEXT: vaddv.u32
+define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32 %N) {
+entry: ; Masked 4-lane reduction: accumulates a[i] * (b[i] or c[i]) and returns the lane sum; returns 0 when N == 0.
+ %cmp8 = icmp eq i32 %N, 0
+ br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
+
+vector.ph: ; preds = %entry
+ %n.rnd.up = add i32 %N, 3
+ %n.vec = and i32 %n.rnd.up, -4 ; N rounded up to a multiple of 4; the loop exits when the index reaches it
+ %trip.count.minus.1 = add i32 %N, -1
+ %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+ %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer ; N-1 in every lane
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ] ; running per-lane accumulator
+ %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; element index of each lane: index+0 .. index+3
+ %tmp = getelementptr inbounds i32, i32* %a, i32 %index
+ %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 ; tail predicate: a lane is active iff its element index <= N-1
+ %tmp2 = bitcast i32* %tmp to <4 x i32>*
+ %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+ %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
+ %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
+ %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+ %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
+ %tmp6 = bitcast i32* %tmp5 to <4 x i32>*
+ %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+ %rem = urem i32 %index, 16
+ %rem.broadcast.splatinsert = insertelement <4 x i32> undef, i32 %rem, i32 0
+ %rem.broadcast.splat = shufflevector <4 x i32> %rem.broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %cmp = icmp eq <4 x i32> %rem.broadcast.splat, <i32 0, i32 0, i32 0, i32 0> ; second predicate, uniform across lanes: true when index % 16 == 0
+ %wide.masked.load = select <4 x i1> %cmp, <4 x i32> %wide.masked.load.b, <4 x i32> %wide.masked.load.c ; b lanes when %cmp holds, otherwise c lanes
+ %mul = mul nsw <4 x i32> %wide.masked.load, %wide.masked.load.a
+ %add = add nsw <4 x i32> %mul, %vec.phi
+ %index.next = add i32 %index, 4
+ %tmp7 = icmp eq i32 %index.next, %n.vec
+ br i1 %tmp7, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %tmp8 = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi ; lanes inactive in the last iteration keep the previous accumulator value
+ %tmp9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp8) ; sum of the four lanes
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %middle.block, %entry
+ %res.0.lcssa = phi i32 [ 0, %entry ], [ %tmp9, %middle.block ]
+ ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: vpsel_mul_reduce_add_2
+; CHECK: dls lr, lr
+; CHECK: [[LOOP:.LBB[0-9_]+]]:
+; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], #4
+; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vstr p0, [sp
+; CHECK: vpstt
+; CHECK-NEXT: vldrwt.u32
+; CHECK-NEXT: vldrwt.u32
+; CHECK: vsub
+; CHECK: vpst
+; CHECK-NEXT: vldrwt.u32
+; CHECK: vcmp.i32
+; CHECK: vpsel
+; CHECK: vldr p0, [sp
+; CHECK: vpst
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK: le lr, [[LOOP]]
+; CHECK: vctp.32 [[ELEMS]]
+; CHECK-NEXT: vpsel
+; CHECK-NEXT: vaddv.u32
+define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
+ i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
+entry: ; Masked 4-lane reduction: accumulates a[i] * ((c[i] - d[i]) or b[i]) and returns the lane sum; returns 0 when N == 0.
+ %cmp8 = icmp eq i32 %N, 0
+ br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
+
+vector.ph: ; preds = %entry
+ %n.rnd.up = add i32 %N, 3
+ %n.vec = and i32 %n.rnd.up, -4 ; N rounded up to a multiple of 4; the loop exits when the index reaches it
+ %trip.count.minus.1 = add i32 %N, -1
+ %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+ %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer ; N-1 in every lane
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ] ; running per-lane accumulator
+ %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; element index of each lane: index+0 .. index+3
+ %tmp = getelementptr inbounds i32, i32* %a, i32 %index
+ %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 ; tail predicate: a lane is active iff its element index <= N-1
+ %tmp2 = bitcast i32* %tmp to <4 x i32>*
+ %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+ %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
+ %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
+ %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+ %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
+ %tmp6 = bitcast i32* %tmp5 to <4 x i32>*
+ %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+ %tmp7 = getelementptr inbounds i32, i32* %d, i32 %index
+ %tmp8 = bitcast i32* %tmp7 to <4 x i32>*
+ %wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+ %sub = sub <4 x i32> %wide.masked.load.c, %wide.masked.load.d ; arithmetic placed between the masked loads and the select
+ %rem = urem i32 %index, 16
+ %rem.broadcast.splatinsert = insertelement <4 x i32> undef, i32 %rem, i32 0
+ %rem.broadcast.splat = shufflevector <4 x i32> %rem.broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %cmp = icmp eq <4 x i32> %rem.broadcast.splat, <i32 0, i32 0, i32 0, i32 0> ; second predicate, uniform across lanes: true when index % 16 == 0
+ %sel = select <4 x i1> %cmp, <4 x i32> %sub, <4 x i32> %wide.masked.load.b ; c - d lanes when %cmp holds, otherwise b lanes
+ %mul = mul <4 x i32> %sel, %wide.masked.load.a
+ %add = add <4 x i32> %mul, %vec.phi
+ %index.next = add i32 %index, 4
+ %cmp.exit = icmp eq i32 %index.next, %n.vec
+ br i1 %cmp.exit, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi ; lanes inactive in the last iteration keep the previous accumulator value
+ %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc) ; sum of the four lanes
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %middle.block, %entry
+ %res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ]
+ ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: and_mul_reduce_add
+; CHECK: dls lr, lr
+; CHECK: [[LOOP:.LBB[0-9_]+]]:
+; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]],{{.*}}#4
+; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vpstt
+; CHECK-NEXT: vldrwt.u32
+; CHECK-NEXT: vldrwt.u32
+; CHECK: vpsttt
+; CHECK-NEXT: vcmpt.i32 eq, {{.*}}, zr
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3]
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2]
+; CHECK: le lr, [[LOOP]]
+; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vpsel
+define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
+ i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
+entry: ; Masked 4-lane reduction of c[i] * d[i], where the c/d loads use (a[i] - b[i] == 0) AND the tail predicate; returns 0 when N == 0.
+ %cmp8 = icmp eq i32 %N, 0
+ br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
+
+vector.ph: ; preds = %entry
+ %n.rnd.up = add i32 %N, 3
+ %n.vec = and i32 %n.rnd.up, -4 ; N rounded up to a multiple of 4; the loop exits when the index reaches it
+ %trip.count.minus.1 = add i32 %N, -1
+ %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+ %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer ; N-1 in every lane
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ] ; running per-lane accumulator
+ %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; element index of each lane: index+0 .. index+3
+ %tmp = getelementptr inbounds i32, i32* %a, i32 %index
+ %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 ; tail predicate: a lane is active iff its element index <= N-1
+ %tmp2 = bitcast i32* %tmp to <4 x i32>*
+ %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+ %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
+ %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
+ %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+ %sub = sub <4 x i32> %wide.masked.load.a, %wide.masked.load.b
+ %cmp = icmp eq <4 x i32> %sub, <i32 0, i32 0, i32 0, i32 0> ; per-lane data-dependent predicate: a[i] == b[i]
+ %mask = and <4 x i1> %cmp, %tmp1 ; combined predicate: data condition AND tail predicate
+ %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
+ %tmp6 = bitcast i32* %tmp5 to <4 x i32>*
+ %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %mask, <4 x i32> undef)
+ %tmp7 = getelementptr inbounds i32, i32* %d, i32 %index
+ %tmp8 = bitcast i32* %tmp7 to <4 x i32>*
+ %wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %mask, <4 x i32> undef)
+ %mul = mul <4 x i32> %wide.masked.load.c, %wide.masked.load.d
+ %add = add <4 x i32> %mul, %vec.phi
+ %index.next = add i32 %index, 4
+ %cmp.exit = icmp eq i32 %index.next, %n.vec
+ br i1 %cmp.exit, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi ; selects on the tail predicate only (%tmp1), not on %mask
+ %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc) ; sum of the four lanes
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %middle.block, %entry
+ %res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ]
+ ret i32 %res.0.lcssa
+}
+
+; TODO: Why does p0 get reloaded from the stack into p0, just to be vmrs'd?
+; CHECK-LABEL: or_mul_reduce_add
+; CHECK: dls lr, lr
+; CHECK: [[LOOP:.LBB[0-9_]+]]:
+; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]],{{.*}}#4
+; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vstr p0, [sp
+; CHECK: vpstt
+; CHECK-NEXT: vldrwt.u32
+; CHECK-NEXT: vldrwt.u32
+; CHECK: vcmp.i32 eq, {{.*}}, zr
+; CHECK: vmrs [[VCMP:r[0-9]+]], p0
+; CHECK: vldr p0, [sp
+; CHECK: vmrs [[VCTP:r[0-9]+]], p0
+; CHECK: orr{{.*}} [[VCMP]], [[VCTP]]
+; CHECK-NEXT: vmsr p0
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3]
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2]
+; CHECK: le lr, [[LOOP]]
+; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vpsel
+define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
+ i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
+entry: ; Masked 4-lane reduction of c[i] * d[i], where the c/d loads use (a[i] - b[i] == 0) OR the tail predicate; returns 0 when N == 0.
+ %cmp8 = icmp eq i32 %N, 0
+ br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
+
+vector.ph: ; preds = %entry
+ %n.rnd.up = add i32 %N, 3
+ %n.vec = and i32 %n.rnd.up, -4 ; N rounded up to a multiple of 4; the loop exits when the index reaches it
+ %trip.count.minus.1 = add i32 %N, -1
+ %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+ %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer ; N-1 in every lane
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ] ; running per-lane accumulator
+ %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; element index of each lane: index+0 .. index+3
+ %tmp = getelementptr inbounds i32, i32* %a, i32 %index
+ %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12 ; tail predicate: a lane is active iff its element index <= N-1
+ %tmp2 = bitcast i32* %tmp to <4 x i32>*
+ %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+ %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
+ %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
+ %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+ %sub = sub <4 x i32> %wide.masked.load.a, %wide.masked.load.b
+ %cmp = icmp eq <4 x i32> %sub, <i32 0, i32 0, i32 0, i32 0> ; per-lane data-dependent predicate: a[i] == b[i]
+ %mask = or <4 x i1> %cmp, %tmp1 ; combined predicate: data condition OR tail predicate
+ %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
+ %tmp6 = bitcast i32* %tmp5 to <4 x i32>*
+ %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %mask, <4 x i32> undef)
+ %tmp7 = getelementptr inbounds i32, i32* %d, i32 %index
+ %tmp8 = bitcast i32* %tmp7 to <4 x i32>*
+ %wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %mask, <4 x i32> undef)
+ %mul = mul <4 x i32> %wide.masked.load.c, %wide.masked.load.d
+ %add = add <4 x i32> %mul, %vec.phi
+ %index.next = add i32 %index, 4
+ %cmp.exit = icmp eq i32 %index.next, %n.vec
+ br i1 %cmp.exit, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi ; selects on the tail predicate only (%tmp1), not on %mask
+ %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc) ; sum of the four lanes
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %middle.block, %entry
+ %res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ]
+ ret i32 %res.0.lcssa
+}
+
+; Function Attrs: argmemonly nounwind readonly willreturn
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) ; call sites above pass (vector pointer, i32 4, lane mask, undef)
+
+; Function Attrs: nounwind readnone willreturn
+declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) ; used in each middle.block to sum the accumulator lanes
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
new file mode 100644
index 00000000000..1612e26e3f7
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
@@ -0,0 +1,242 @@
+; RUN: llc -mtriple=armv8.1m.main -mattr=+mve -enable-arm-maskedldst=true -disable-mve-tail-predication=false --verify-machineinstrs %s -o - | FileCheck %s
+
+; CHECK-LABEL: mul_reduce_add
+; CHECK: dls lr,
+; CHECK: [[LOOP:.LBB[0-9_]+]]:
+; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], #4
+; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vpstt
+; CHECK-NEXT: vldrwt.u32
+; CHECK-NEXT: vldrwt.u32
+; CHECK: le lr, [[LOOP]]
+; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vpsel
+; CHECK: vaddv.u32 r0
+define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
+entry: ; Masked 4-lane reduction: accumulates a[i] * b[i] and returns the lane sum; returns 0 when N == 0.
+ %cmp8 = icmp eq i32 %N, 0
+ br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
+
+vector.ph: ; preds = %entry
+ %n.rnd.up = add i32 %N, 3
+ %n.vec = and i32 %n.rnd.up, -4 ; N rounded up to a multiple of 4; the loop exits when the index reaches it
+ %trip.count.minus.1 = add i32 %N, -1
+ %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+ %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer ; N-1 in every lane
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ] ; running per-lane accumulator
+ %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; element index of each lane: index+0 .. index+3
+ %0 = getelementptr inbounds i32, i32* %a, i32 %index
+ %1 = icmp ule <4 x i32> %induction, %broadcast.splat12 ; tail predicate: a lane is active iff its element index <= N-1
+ %2 = bitcast i32* %0 to <4 x i32>*
+ %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
+ %3 = getelementptr inbounds i32, i32* %b, i32 %index
+ %4 = bitcast i32* %3 to <4 x i32>*
+ %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %1, <4 x i32> undef)
+ %5 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
+ %6 = add nsw <4 x i32> %5, %vec.phi
+ %index.next = add i32 %index, 4
+ %7 = icmp eq i32 %index.next, %n.vec
+ br i1 %7, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %8 = select <4 x i1> %1, <4 x i32> %6, <4 x i32> %vec.phi ; lanes inactive in the last iteration keep the previous accumulator value
+ %9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %8) ; sum of the four lanes
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %middle.block, %entry
+ %res.0.lcssa = phi i32 [ 0, %entry ], [ %9, %middle.block ]
+ ret i32 %res.0.lcssa
+}
+
+; Function Attrs: norecurse nounwind readonly
+define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
+entry: ; Masked 4-lane sum of a[i]; returns 0 when N == 0. NOTE(review): %b is never used, the body contains no multiply (it matches @add_reduce_add_const), and no FileCheck directives precede this function - confirm whether that is intended.
+ %cmp6 = icmp eq i32 %N, 0
+ br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
+
+vector.ph: ; preds = %entry
+ %n.rnd.up = add i32 %N, 3
+ %n.vec = and i32 %n.rnd.up, -4 ; N rounded up to a multiple of 4; the loop exits when the index reaches it
+ %trip.count.minus.1 = add i32 %N, -1
+ %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+ %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer ; N-1 in every lane
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ] ; running per-lane accumulator
+ %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; element index of each lane: index+0 .. index+3
+ %0 = getelementptr inbounds i32, i32* %a, i32 %index
+ %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 ; tail predicate: a lane is active iff its element index <= N-1
+ %2 = bitcast i32* %0 to <4 x i32>*
+ %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
+ %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
+ %index.next = add i32 %index, 4
+ %4 = icmp eq i32 %index.next, %n.vec
+ br i1 %4, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi ; lanes inactive in the last iteration keep the previous accumulator value
+ %6 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %5) ; sum of the four lanes
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %middle.block, %entry
+ %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ]
+ ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: add_reduce_add_const
+; CHECK: dls lr, lr
+; CHECK: [[LOOP:.LBB[0-9_]+]]:
+; CHECK: subs [[ELEMS:r[0-9]+]], #4
+; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vpst
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK: vadd.i32
+; CHECK: le lr, [[LOOP]]
+; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vpsel
+define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
+entry: ; Masked 4-lane sum of a[i]; returns 0 when N == 0. NOTE(review): %b is never used in the body - confirm whether a constant operand was meant to be added.
+ %cmp6 = icmp eq i32 %N, 0
+ br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
+
+vector.ph: ; preds = %entry
+ %n.rnd.up = add i32 %N, 3
+ %n.vec = and i32 %n.rnd.up, -4 ; N rounded up to a multiple of 4; the loop exits when the index reaches it
+ %trip.count.minus.1 = add i32 %N, -1
+ %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+ %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer ; N-1 in every lane
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ] ; running per-lane accumulator
+ %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; element index of each lane: index+0 .. index+3
+ %0 = getelementptr inbounds i32, i32* %a, i32 %index
+ %1 = icmp ule <4 x i32> %induction, %broadcast.splat10 ; tail predicate: a lane is active iff its element index <= N-1
+ %2 = bitcast i32* %0 to <4 x i32>*
+ %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
+ %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
+ %index.next = add i32 %index, 4
+ %4 = icmp eq i32 %index.next, %n.vec
+ br i1 %4, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi ; lanes inactive in the last iteration keep the previous accumulator value
+ %6 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %5) ; sum of the four lanes
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %middle.block, %entry
+ %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ]
+ ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: vector_mul_const
+; CHECK: dls lr, lr
+; CHECK: [[LOOP:.LBB[0-9_]+]]:
+; CHECK: subs [[ELEMS:r[0-9]+]], #4
+; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vpst
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r1]
+; CHECK: vmul.i32
+; CHECK: vpst
+; CHECK-NEXT: vstrwt.32 q{{.*}}, [r0]
+; CHECK: le lr, [[LOOP]]
+define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c, i32 %N) {
+entry: ; Masked 4-lane loop storing a[i] = b[i] * c; does nothing when N == 0.
+ %cmp6 = icmp eq i32 %N, 0
+ br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
+
+vector.ph: ; preds = %entry
+ %n.rnd.up = add i32 %N, 3
+ %n.vec = and i32 %n.rnd.up, -4 ; N rounded up to a multiple of 4; the loop exits when the index reaches it
+ %trip.count.minus.1 = add i32 %N, -1
+ %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+ %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer ; N-1 in every lane
+ %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
+ %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer ; scalar %c in every lane
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; element index of each lane: index+0 .. index+3
+ %0 = getelementptr inbounds i32, i32* %b, i32 %index
+ %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 ; tail predicate: a lane is active iff its element index <= N-1
+ %2 = bitcast i32* %0 to <4 x i32>*
+ %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
+ %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat11
+ %4 = getelementptr inbounds i32, i32* %a, i32 %index
+ %5 = bitcast i32* %4 to <4 x i32>*
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %5, i32 4, <4 x i1> %1) ; store uses the same predicate as the load
+ %index.next = add i32 %index, 4
+ %6 = icmp eq i32 %index.next, %n.vec
+ br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body, %entry
+ ret void
+}
+
+; CHECK-LABEL: vector_add_const
+; CHECK: dls lr, lr
+; CHECK: [[LOOP:.LBB[0-9_]+]]:
+; CHECK: subs [[ELEMS:r[0-9]+]], #4
+; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vpst
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r1]
+; CHECK: vadd.i32
+; CHECK: vpst
+; CHECK-NEXT: vstrwt.32 q{{.*}}, [r0]
+; CHECK: le lr, [[LOOP]]
+define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c, i32 %N) {
+entry: ; Masked 4-lane loop storing a[i] = b[i] + c; does nothing when N == 0.
+ %cmp6 = icmp eq i32 %N, 0
+ br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
+
+vector.ph: ; preds = %entry
+ %n.rnd.up = add i32 %N, 3
+ %n.vec = and i32 %n.rnd.up, -4 ; N rounded up to a multiple of 4; the loop exits when the index reaches it
+ %trip.count.minus.1 = add i32 %N, -1
+ %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+ %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer ; N-1 in every lane
+ %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
+ %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer ; scalar %c in every lane
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; element index of each lane: index+0 .. index+3
+ %0 = getelementptr inbounds i32, i32* %b, i32 %index
+ %1 = icmp ule <4 x i32> %induction, %broadcast.splat9 ; tail predicate: a lane is active iff its element index <= N-1
+ %2 = bitcast i32* %0 to <4 x i32>*
+ %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
+ %3 = add nsw <4 x i32> %wide.masked.load, %broadcast.splat11
+ %4 = getelementptr inbounds i32, i32* %a, i32 %index
+ %5 = bitcast i32* %4 to <4 x i32>*
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %5, i32 4, <4 x i1> %1) ; store uses the same predicate as the load
+ %index.next = add i32 %index, 4
+ %6 = icmp eq i32 %index.next, %n.vec
+ br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body, %entry
+ ret void
+}
+
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) ; call sites above pass (vector pointer, i32 4, lane mask, undef)
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #4 ; NOTE(review): no definition of attribute group #4 is visible in this file - confirm the reference is intended
+declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) ; used in each middle.block to sum the accumulator lanes
+
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
new file mode 100644
index 00000000000..824f1d5790d
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
@@ -0,0 +1,75 @@
+
+; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve %s -S -o - | FileCheck %s
+
+; CHECK-LABEL: vec_mul_reduce_add
+
+; CHECK: vector.body:
+; CHECK-NOT: phi i32 [ 0, %vector.ph ]
+; CHECK: [[ELTS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[SUB:%[^ ]+]], %vector.body ]
+; CHECK: [[SUB]] = sub i32 [[ELTS]], 4
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[SUB]])
+; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]]
+; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]],
+
+; CHECK: middle.block:
+; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[SUB]])
+; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP_CLONE]],
+; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]])
+
+define i32 @vec_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
+entry: ; Input for the opt run above: masked a[i] * b[i] reduction already using the set.loop.iterations / loop.decrement intrinsics; returns 0 when N == 0.
+ %cmp8 = icmp eq i32 %N, 0
+ %0 = add i32 %N, 3
+ %1 = lshr i32 %0, 2
+ %2 = shl nuw i32 %1, 2
+ %3 = add i32 %2, -4
+ %4 = lshr i32 %3, 2
+ %5 = add nuw nsw i32 %4, 1 ; iteration count derived from N; passed to llvm.set.loop.iterations in vector.ph
+ br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
+
+vector.ph: ; preds = %entry
+ %trip.count.minus.1 = add i32 %N, -1
+ %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+ %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer ; N-1 in every lane
+ call void @llvm.set.loop.iterations.i32(i32 %5)
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; element index, used only to build the lane predicate
+ %lsr.iv2 = phi i32* [ %scevgep3, %vector.body ], [ %a, %vector.ph ] ; pointer into a, advanced by 4 elements per iteration
+ %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %b, %vector.ph ] ; pointer into b, advanced by 4 elements per iteration
+ %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %9, %vector.body ] ; running per-lane accumulator
+ %6 = phi i32 [ %5, %vector.ph ], [ %10, %vector.body ] ; remaining-iterations counter
+ %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>*
+ %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
+ %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+ %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; element index of each lane: index+0 .. index+3
+ %7 = icmp ule <4 x i32> %induction, %broadcast.splat12 ; tail predicate: a lane is active iff its element index <= N-1
+ %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %7, <4 x i32> undef)
+ %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %7, <4 x i32> undef)
+ %8 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
+ %9 = add nsw <4 x i32> %8, %vec.phi
+ %index.next = add i32 %index, 4
+ %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+ %scevgep3 = getelementptr i32, i32* %lsr.iv2, i32 4
+ %10 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
+ %11 = icmp ne i32 %10, 0
+ br i1 %11, label %vector.body, label %middle.block ; keep looping while the decremented counter is non-zero
+
+middle.block: ; preds = %vector.body
+ %12 = icmp ule <4 x i32> %induction, %broadcast.splat12 ; same comparison as %7, repeated outside the loop (the duplicate predicate in the exit block)
+ %13 = select <4 x i1> %12, <4 x i32> %9, <4 x i32> %vec.phi ; lanes inactive in the last iteration keep the previous accumulator value
+ %14 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %13) ; sum of the four lanes
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %middle.block, %entry
+ %res.0.lcssa = phi i32 [ 0, %entry ], [ %14, %middle.block ]
+ ret i32 %res.0.lcssa
+}
+
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) ; call sites above pass (vector pointer, i32 4, lane mask, undef)
+declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) ; used in middle.block to sum the accumulator lanes
+declare void @llvm.set.loop.iterations.i32(i32) ; called once in vector.ph with the iteration count
+declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) ; called in vector.body with the counter and a step of 1
+
OpenPOWER on IntegriCloud