[ARM][MVE] Remove old tail predicates

Remove any predicate that we replace with a vctp intrinsic, and try to remove their operands too. Also look into the exit block to see if there's any duplicates of the predicates that we've replaced and clone the vctp to be used there instead. Differential Revision: https://reviews.llvm.org/D67709 llvm-svn: 372567
author: Sam Parker <sam.parker@arm.com> 2019-09-23 09:48:25 +0000
committer: Sam Parker <sam.parker@arm.com> 2019-09-23 09:48:25 +0000
commit: 9feb429a337ff49fe119a64bff3724fb820c4501 (patch)
tree: 2ab99115a6b989822c1243c41a07497fe1632313 /llvm/test/CodeGen/Thumb2/LowOverheadLoops
parent: 14f6465c157b36c50ffe431463a9c94efda42b99 (diff)
download: bcm5719-llvm-9feb429a337ff49fe119a64bff3724fb820c4501.tar.gz
bcm5719-llvm-9feb429a337ff49fe119a64bff3724fb820c4501.zip
3 files changed, 609 insertions, 0 deletions
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
new file mode 100644
index 00000000000..d701e0f1b57
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
@@ -0,0 +1,292 @@
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -enable-arm-maskedldst=true -disable-mve-tail-predication=false --verify-machineinstrs %s -o - | FileCheck %s
+
+; CHECK-LABEL: vpsel_mul_reduce_add
+; CHECK:      dls lr, lr
+; CHECK:  [[LOOP:.LBB[0-9_]+]]:
+; CHECK:      sub{{.*}} [[ELEMS:r[0-9]+]], #4
+; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vstr p0, [sp
+; CHECK:      vpstt	
+; CHECK-NEXT: vldrwt.u32
+; CHECK-NEXT: vldrwt.u32
+; CHECK:      vcmp.i32
+; CHECK:      vpsel
+; CHECK:      vldr p0, [sp
+; CHECK:      vpst	
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK:      le lr, [[LOOP]]
+; CHECK:      vctp.32	[[ELEMS]]
+; CHECK-NEXT: vpsel
+; CHECK-NEXT: vaddv.u32
+define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32 %N) {
+entry:
+  %cmp8 = icmp eq i32 %N, 0
+  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
+
+vector.ph:                                        ; preds = %entry
+  %n.rnd.up = add i32 %N, 3
+  %n.vec = and i32 %n.rnd.up, -4
+  %trip.count.minus.1 = add i32 %N, -1
+  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
+  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
+  %tmp2 = bitcast i32* %tmp to <4 x i32>*
+  %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
+  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
+  %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
+  %tmp6 = bitcast i32* %tmp5 to <4 x i32>*
+  %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %rem = urem i32 %index, 16
+  %rem.broadcast.splatinsert = insertelement <4 x i32> undef, i32 %rem, i32 0
+  %rem.broadcast.splat = shufflevector <4 x i32> %rem.broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %cmp = icmp eq <4 x i32> %rem.broadcast.splat, <i32 0, i32 0, i32 0, i32 0>
+  %wide.masked.load = select <4 x i1> %cmp, <4 x i32> %wide.masked.load.b, <4 x i32> %wide.masked.load.c
+  %mul = mul nsw <4 x i32> %wide.masked.load, %wide.masked.load.a
+  %add = add nsw <4 x i32> %mul, %vec.phi
+  %index.next = add i32 %index, 4
+  %tmp7 = icmp eq i32 %index.next, %n.vec
+  br i1 %tmp7, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %tmp8 = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
+  %tmp9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %tmp8)
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %middle.block, %entry
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %tmp9, %middle.block ]
+  ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: vpsel_mul_reduce_add_2
+; CHECK:      dls lr, lr
+; CHECK:  [[LOOP:.LBB[0-9_]+]]:
+; CHECK:      sub{{.*}} [[ELEMS:r[0-9]+]], #4
+; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vstr p0, [sp
+; CHECK:      vpstt
+; CHECK-NEXT: vldrwt.u32
+; CHECK-NEXT: vldrwt.u32
+; CHECK;      vsub
+; CHECK:      vpst
+; CHECK-NEXT: vldrwt.u32
+; CHECK:      vcmp.i32
+; CHECK:      vpsel
+; CHECK:      vldr p0, [sp
+; CHECK:      vpst	
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK:      le lr, [[LOOP]]
+; CHECK:      vctp.32	[[ELEMS]]
+; CHECK-NEXT: vpsel
+; CHECK-NEXT: vaddv.u32
+define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
+                                         i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
+entry:
+  %cmp8 = icmp eq i32 %N, 0
+  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
+
+vector.ph:                                        ; preds = %entry
+  %n.rnd.up = add i32 %N, 3
+  %n.vec = and i32 %n.rnd.up, -4
+  %trip.count.minus.1 = add i32 %N, -1
+  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
+  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
+  %tmp2 = bitcast i32* %tmp to <4 x i32>*
+  %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
+  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
+  %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
+  %tmp6 = bitcast i32* %tmp5 to <4 x i32>*
+  %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %tmp7 = getelementptr inbounds i32, i32* %d, i32 %index
+  %tmp8 = bitcast i32* %tmp7 to <4 x i32>*
+  %wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %sub = sub <4 x i32> %wide.masked.load.c, %wide.masked.load.d
+  %rem = urem i32 %index, 16
+  %rem.broadcast.splatinsert = insertelement <4 x i32> undef, i32 %rem, i32 0
+  %rem.broadcast.splat = shufflevector <4 x i32> %rem.broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %cmp = icmp eq <4 x i32> %rem.broadcast.splat, <i32 0, i32 0, i32 0, i32 0>
+  %sel = select <4 x i1> %cmp, <4 x i32> %sub, <4 x i32> %wide.masked.load.b
+  %mul = mul  <4 x i32> %sel, %wide.masked.load.a
+  %add = add  <4 x i32> %mul, %vec.phi
+  %index.next = add i32 %index, 4
+  %cmp.exit = icmp eq i32 %index.next, %n.vec
+  br i1 %cmp.exit, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
+  %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc)
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %middle.block, %entry
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ]
+  ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: and_mul_reduce_add
+; CHECK:      dls lr, lr
+; CHECK:  [[LOOP:.LBB[0-9_]+]]:
+; CHECK:      sub{{.*}} [[ELEMS:r[0-9]+]],{{.*}}#4
+; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vpstt	
+; CHECK-NEXT: vldrwt.u32
+; CHECK-NEXT: vldrwt.u32
+; CHECK:      vpsttt
+; CHECK-NEXT: vcmpt.i32	eq, {{.*}}, zr
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3]
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2]
+; CHECK:      le lr, [[LOOP]]
+; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vpsel
+define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
+                                         i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
+entry:
+  %cmp8 = icmp eq i32 %N, 0
+  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
+
+vector.ph:                                        ; preds = %entry
+  %n.rnd.up = add i32 %N, 3
+  %n.vec = and i32 %n.rnd.up, -4
+  %trip.count.minus.1 = add i32 %N, -1
+  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
+  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
+  %tmp2 = bitcast i32* %tmp to <4 x i32>*
+  %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
+  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
+  %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %sub = sub <4 x i32> %wide.masked.load.a, %wide.masked.load.b
+  %cmp = icmp eq <4 x i32> %sub, <i32 0, i32 0, i32 0, i32 0>
+  %mask = and <4 x i1> %cmp, %tmp1
+  %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
+  %tmp6 = bitcast i32* %tmp5 to <4 x i32>*
+  %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %mask, <4 x i32> undef)
+  %tmp7 = getelementptr inbounds i32, i32* %d, i32 %index
+  %tmp8 = bitcast i32* %tmp7 to <4 x i32>*
+  %wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %mask, <4 x i32> undef)
+  %mul = mul  <4 x i32> %wide.masked.load.c, %wide.masked.load.d
+  %add = add  <4 x i32> %mul, %vec.phi
+  %index.next = add i32 %index, 4
+  %cmp.exit = icmp eq i32 %index.next, %n.vec
+  br i1 %cmp.exit, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
+  %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc)
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %middle.block, %entry
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ]
+  ret i32 %res.0.lcssa
+}
+
+; TODO: Why does p0 get reloaded from the stack into p0, just to be vmrs'd?
+; CHECK-LABEL: or_mul_reduce_add
+; CHECK:      dls lr, lr
+; CHECK:  [[LOOP:.LBB[0-9_]+]]:
+; CHECK:      sub{{.*}} [[ELEMS:r[0-9]+]],{{.*}}#4
+; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vstr p0, [sp
+; CHECK:      vpstt	
+; CHECK-NEXT: vldrwt.u32
+; CHECK-NEXT: vldrwt.u32
+; CHECK:      vcmp.i32	eq, {{.*}}, zr
+; CHECK:      vmrs [[VCMP:r[0-9]+]], p0
+; CHECK:      vldr p0, [sp
+; CHECK:      vmrs [[VCTP:r[0-9]+]], p0
+; CHECK:      orr{{.*}} [[VCMP]], [[VCTP]]
+; CHECK-NEXT: vmsr p0
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3]
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2]
+; CHECK:      le lr, [[LOOP]]
+; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vpsel
+define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
+                                        i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
+entry:
+  %cmp8 = icmp eq i32 %N, 0
+  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
+
+vector.ph:                                        ; preds = %entry
+  %n.rnd.up = add i32 %N, 3
+  %n.vec = and i32 %n.rnd.up, -4
+  %trip.count.minus.1 = add i32 %N, -1
+  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %add, %vector.body ]
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
+  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
+  %tmp2 = bitcast i32* %tmp to <4 x i32>*
+  %wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
+  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
+  %wide.masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
+  %sub = sub <4 x i32> %wide.masked.load.a, %wide.masked.load.b
+  %cmp = icmp eq <4 x i32> %sub, <i32 0, i32 0, i32 0, i32 0>
+  %mask = or <4 x i1> %cmp, %tmp1
+  %tmp5 = getelementptr inbounds i32, i32* %c, i32 %index
+  %tmp6 = bitcast i32* %tmp5 to <4 x i32>*
+  %wide.masked.load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp6, i32 4, <4 x i1> %mask, <4 x i32> undef)
+  %tmp7 = getelementptr inbounds i32, i32* %d, i32 %index
+  %tmp8 = bitcast i32* %tmp7 to <4 x i32>*
+  %wide.masked.load.d = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %mask, <4 x i32> undef)
+  %mul = mul  <4 x i32> %wide.masked.load.c, %wide.masked.load.d
+  %add = add  <4 x i32> %mul, %vec.phi
+  %index.next = add i32 %index, 4
+  %cmp.exit = icmp eq i32 %index.next, %n.vec
+  br i1 %cmp.exit, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %acc = select <4 x i1> %tmp1, <4 x i32> %add, <4 x i32> %vec.phi
+  %reduce = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %acc)
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %middle.block, %entry
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %reduce, %middle.block ]
+  ret i32 %res.0.lcssa
+}
+
+; Function Attrs: argmemonly nounwind readonly willreturn
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+
+; Function Attrs: nounwind readnone willreturn
+declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
new file mode 100644
index 00000000000..1612e26e3f7
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
@@ -0,0 +1,242 @@
+; RUN: llc -mtriple=armv8.1m.main -mattr=+mve -enable-arm-maskedldst=true -disable-mve-tail-predication=false --verify-machineinstrs %s -o - | FileCheck %s
+
+; CHECK-LABEL: mul_reduce_add
+; CHECK:      dls lr,
+; CHECK: [[LOOP:.LBB[0-9_]+]]:
+; CHECK:      sub{{.*}} [[ELEMS:r[0-9]+]], #4
+; CHECK:      vctp.32	[[ELEMS]]
+; CHECK:      vpstt	
+; CHECK-NEXT: vldrwt.u32
+; CHECK-NEXT: vldrwt.u32
+; CHECK:      le	lr, [[LOOP]]
+; CHECK:      vctp.32	[[ELEMS]]
+; CHECK:      vpsel
+; CHECK:      vaddv.u32	r0
+define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
+entry:
+  %cmp8 = icmp eq i32 %N, 0
+  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
+
+vector.ph:                                        ; preds = %entry
+  %n.rnd.up = add i32 %N, 3
+  %n.vec = and i32 %n.rnd.up, -4
+  %trip.count.minus.1 = add i32 %N, -1
+  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %6, %vector.body ]
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+  %0 = getelementptr inbounds i32, i32* %a, i32 %index
+  %1 = icmp ule <4 x i32> %induction, %broadcast.splat12
+  %2 = bitcast i32* %0 to <4 x i32>*
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
+  %3 = getelementptr inbounds i32, i32* %b, i32 %index
+  %4 = bitcast i32* %3 to <4 x i32>*
+  %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %1, <4 x i32> undef)
+  %5 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
+  %6 = add nsw <4 x i32> %5, %vec.phi
+  %index.next = add i32 %index, 4
+  %7 = icmp eq i32 %index.next, %n.vec
+  br i1 %7, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %8 = select <4 x i1> %1, <4 x i32> %6, <4 x i32> %vec.phi
+  %9 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %8)
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %middle.block, %entry
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %9, %middle.block ]
+  ret i32 %res.0.lcssa
+}
+
+; Function Attrs: norecurse nounwind readonly
+define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
+entry:
+  %cmp6 = icmp eq i32 %N, 0
+  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
+
+vector.ph:                                        ; preds = %entry
+  %n.rnd.up = add i32 %N, 3
+  %n.vec = and i32 %n.rnd.up, -4
+  %trip.count.minus.1 = add i32 %N, -1
+  %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+  %0 = getelementptr inbounds i32, i32* %a, i32 %index
+  %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
+  %2 = bitcast i32* %0 to <4 x i32>*
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
+  %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
+  %index.next = add i32 %index, 4
+  %4 = icmp eq i32 %index.next, %n.vec
+  br i1 %4, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi
+  %6 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %5)
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %middle.block, %entry
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ]
+  ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: add_reduce_add_const
+; CHECK:      dls lr, lr
+; CHECK: [[LOOP:.LBB[0-9_]+]]:
+; CHECK:      subs [[ELEMS:r[0-9]+]], #4
+; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vpst	
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK:      vadd.i32
+; CHECK:      le lr, [[LOOP]]
+; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vpsel
+define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
+entry:
+  %cmp6 = icmp eq i32 %N, 0
+  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
+
+vector.ph:                                        ; preds = %entry
+  %n.rnd.up = add i32 %N, 3
+  %n.vec = and i32 %n.rnd.up, -4
+  %trip.count.minus.1 = add i32 %N, -1
+  %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+  %0 = getelementptr inbounds i32, i32* %a, i32 %index
+  %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
+  %2 = bitcast i32* %0 to <4 x i32>*
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
+  %3 = add nsw <4 x i32> %wide.masked.load, %vec.phi
+  %index.next = add i32 %index, 4
+  %4 = icmp eq i32 %index.next, %n.vec
+  br i1 %4, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %5 = select <4 x i1> %1, <4 x i32> %3, <4 x i32> %vec.phi
+  %6 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %5)
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %middle.block, %entry
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %6, %middle.block ]
+  ret i32 %res.0.lcssa
+}
+
+; CHECK-LABEL: vector_mul_const
+; CHECK:      dls lr, lr
+; CHECK: [[LOOP:.LBB[0-9_]+]]:
+; CHECK:      subs [[ELEMS:r[0-9]+]], #4
+; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vpst	
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r1]
+; CHECK:      vmul.i32
+; CHECK:      vpst	
+; CHECK-NEXT: vstrwt.32 q{{.*}}, [r0]
+; CHECK:      le lr, [[LOOP]]
+define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c, i32 %N) {
+entry:
+  %cmp6 = icmp eq i32 %N, 0
+  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
+
+vector.ph:                                        ; preds = %entry
+  %n.rnd.up = add i32 %N, 3
+  %n.vec = and i32 %n.rnd.up, -4
+  %trip.count.minus.1 = add i32 %N, -1
+  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
+  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+  %0 = getelementptr inbounds i32, i32* %b, i32 %index
+  %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
+  %2 = bitcast i32* %0 to <4 x i32>*
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
+  %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat11
+  %4 = getelementptr inbounds i32, i32* %a, i32 %index
+  %5 = bitcast i32* %4 to <4 x i32>*
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %5, i32 4, <4 x i1> %1)
+  %index.next = add i32 %index, 4
+  %6 = icmp eq i32 %index.next, %n.vec
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body, %entry
+  ret void
+}
+
+; CHECK-LABEL: vector_add_const
+; CHECK:      dls lr, lr
+; CHECK: [[LOOP:.LBB[0-9_]+]]:
+; CHECK:      subs [[ELEMS:r[0-9]+]], #4
+; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vpst	
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r1]
+; CHECK:      vadd.i32
+; CHECK:      vpst	
+; CHECK-NEXT: vstrwt.32 q{{.*}}, [r0]
+; CHECK:      le lr, [[LOOP]]
+define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %c, i32 %N) {
+entry:
+  %cmp6 = icmp eq i32 %N, 0
+  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
+
+vector.ph:                                        ; preds = %entry
+  %n.rnd.up = add i32 %N, 3
+  %n.vec = and i32 %n.rnd.up, -4
+  %trip.count.minus.1 = add i32 %N, -1
+  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %c, i32 0
+  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+  %0 = getelementptr inbounds i32, i32* %b, i32 %index
+  %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
+  %2 = bitcast i32* %0 to <4 x i32>*
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
+  %3 = add nsw <4 x i32> %wide.masked.load, %broadcast.splat11
+  %4 = getelementptr inbounds i32, i32* %a, i32 %index
+  %5 = bitcast i32* %4 to <4 x i32>*
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %5, i32 4, <4 x i1> %1)
+  %index.next = add i32 %index, 4
+  %6 = icmp eq i32 %index.next, %n.vec
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body, %entry
+  ret void
+}
+
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #4
+declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
new file mode 100644
index 00000000000..824f1d5790d
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
@@ -0,0 +1,75 @@
+
+; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve %s -S -o - | FileCheck %s
+
+; CHECK-LABEL: vec_mul_reduce_add
+
+; CHECK: vector.body:
+; CHECK-NOT: phi i32 [ 0, %vector.ph ]
+; CHECK: [[ELTS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[SUB:%[^ ]+]], %vector.body ]
+; CHECK: [[SUB]] = sub i32 [[ELTS]], 4
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[SUB]])
+; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]]
+; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]],
+
+; CHECK: middle.block:
+; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[SUB]])
+; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP_CLONE]],
+; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]])
+
+define i32 @vec_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
+entry:
+  %cmp8 = icmp eq i32 %N, 0
+  %0 = add i32 %N, 3
+  %1 = lshr i32 %0, 2
+  %2 = shl nuw i32 %1, 2
+  %3 = add i32 %2, -4
+  %4 = lshr i32 %3, 2
+  %5 = add nuw nsw i32 %4, 1
+  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
+  
+vector.ph:                                        ; preds = %entry
+  %trip.count.minus.1 = add i32 %N, -1
+  %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
+  %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
+  call void @llvm.set.loop.iterations.i32(i32 %5)
+  br label %vector.body
+  
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %lsr.iv2 = phi i32* [ %scevgep3, %vector.body ], [ %a, %vector.ph ]
+  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %b, %vector.ph ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %9, %vector.body ]
+  %6 = phi i32 [ %5, %vector.ph ], [ %10, %vector.body ]
+  %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>*
+  %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>*
+  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+  %7 = icmp ule <4 x i32> %induction, %broadcast.splat12
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %7, <4 x i32> undef)
+  %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %7, <4 x i32> undef)
+  %8 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
+  %9 = add nsw <4 x i32> %8, %vec.phi
+  %index.next = add i32 %index, 4
+  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+  %scevgep3 = getelementptr i32, i32* %lsr.iv2, i32 4
+  %10 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
+  %11 = icmp ne i32 %10, 0
+  br i1 %11, label %vector.body, label %middle.block
+  
+middle.block:                                     ; preds = %vector.body
+  %12 = icmp ule <4 x i32> %induction, %broadcast.splat12
+  %13 = select <4 x i1> %12, <4 x i32> %9, <4 x i32> %vec.phi
+  %14 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %13)
+  br label %for.cond.cleanup
+  
+for.cond.cleanup:                                 ; preds = %middle.block, %entry
+  %res.0.lcssa = phi i32 [ 0, %entry ], [ %14, %middle.block ]
+  ret i32 %res.0.lcssa
+}
+  
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
+declare void @llvm.set.loop.iterations.i32(i32)
+declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
+
author	Sam Parker <sam.parker@arm.com>	2019-09-23 09:48:25 +0000
committer	Sam Parker <sam.parker@arm.com>	2019-09-23 09:48:25 +0000
commit	9feb429a337ff49fe119a64bff3724fb820c4501 (patch)
tree	2ab99115a6b989822c1243c41a07497fe1632313 /llvm/test/CodeGen/Thumb2/LowOverheadLoops
parent	14f6465c157b36c50ffe431463a9c94efda42b99 (diff)
download	bcm5719-llvm-9feb429a337ff49fe119a64bff3724fb820c4501.tar.gz bcm5719-llvm-9feb429a337ff49fe119a64bff3724fb820c4501.zip