[ARM] Sink add/mul(shufflevector(insertelement())) for MVE instruction selection

This patch sinks add/mul(shufflevector(insertelement())) into the basic block in which they are used so that they can then be selected together. This is useful for various MVE instructions, such as vmla and others that take R registers. Loop tests have been added to the vmla test file to make sure vmlas are generated in loops. Differential revision: https://reviews.llvm.org/D66295 llvm-svn: 371218
author: Sam Tebbs <sam.tebbs@arm.com> 2019-09-06 16:01:32 +0000
committer: Sam Tebbs <sam.tebbs@arm.com> 2019-09-06 16:01:32 +0000
commit: f1cdd95a2fe79fbcd7fd440509a754bc3afaf088 (patch)
tree: f36f31741b8883c8c19f035cdc26ee3c9f4ee927 /llvm/test/Transforms/CodeGenPrepare
parent: 8d30c1dcec2a935e0b1cffc26fdc6054ff101f53 (diff)
download: bcm5719-llvm-f1cdd95a2fe79fbcd7fd440509a754bc3afaf088.tar.gz
bcm5719-llvm-f1cdd95a2fe79fbcd7fd440509a754bc3afaf088.zip
1 files changed, 216 insertions, 0 deletions
diff --git a/llvm/test/Transforms/CodeGenPrepare/ARM/sink-add-mul-shufflevector.ll b/llvm/test/Transforms/CodeGenPrepare/ARM/sink-add-mul-shufflevector.ll
new file mode 100644
index 00000000000..739877b3f0c
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/ARM/sink-add-mul-shufflevector.ll
@@ -0,0 +1,216 @@
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp < %s -codegenprepare -S | FileCheck -check-prefix=CHECK %s
+
+define void @sink_add_mul(i32* %s1, i32 %x, i32* %d, i32 %n) {
+; CHECK-LABEL: @sink_add_mul(
+; CHECK:    vector.ph:
+; CHECK-NOT:  [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
+; CHECK-NOT:  [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK:    vector.body:
+; CHECK:      [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
+; CHECK:      [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %vector.ph, label %for.cond.cleanup
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %n.vec = and i32 %n, -4
+  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %x, i32 0
+  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %s1, i32 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+  %2 = mul nsw <4 x i32> %wide.load, %broadcast.splat9
+  %3 = getelementptr inbounds i32, i32* %d, i32 %index
+  %4 = bitcast i32* %3 to <4 x i32>*
+  %wide.load10 = load <4 x i32>, <4 x i32>* %4, align 4
+  %5 = add nsw <4 x i32> %wide.load10, %2
+  %6 = bitcast i32* %3 to <4 x i32>*
+  store <4 x i32> %5, <4 x i32>* %6, align 4
+  %index.next = add i32 %index, 4
+  %7 = icmp eq i32 %index.next, %n.vec
+  br i1 %7, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
+  ret void
+}
+
+define void @sink_add_mul_multiple(i32* %s1, i32* %s2, i32 %x, i32* %d, i32* %d2, i32 %n) {
+; CHECK-LABEL: @sink_add_mul_multiple(
+; CHECK:    vector.ph:
+; CHECK-NOT:  [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
+; CHECK-NOT:  [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK:    vector.body:
+; CHECK:      [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
+; CHECK:      [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK:      [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
+;
+entry:
+  %cmp13 = icmp sgt i32 %n, 0
+  br i1 %cmp13, label %vector.ph, label %for.cond.cleanup
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %n.vec = and i32 %n, -4
+  %broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %x, i32 0
+  %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %s1, i32 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+  %2 = mul nsw <4 x i32> %wide.load, %broadcast.splat16
+  %3 = getelementptr inbounds i32, i32* %d, i32 %index
+  %4 = bitcast i32* %3 to <4 x i32>*
+  %wide.load17 = load <4 x i32>, <4 x i32>* %4, align 4
+  %5 = add nsw <4 x i32> %wide.load17, %2
+  %6 = bitcast i32* %3 to <4 x i32>*
+  store <4 x i32> %5, <4 x i32>* %6, align 4
+  %7 = getelementptr inbounds i32, i32* %s2, i32 %index
+  %8 = bitcast i32* %7 to <4 x i32>*
+  %wide.load18 = load <4 x i32>, <4 x i32>* %8, align 4
+  %9 = mul nsw <4 x i32> %wide.load18, %broadcast.splat16
+  %10 = getelementptr inbounds i32, i32* %d2, i32 %index
+  %11 = bitcast i32* %10 to <4 x i32>*
+  %wide.load19 = load <4 x i32>, <4 x i32>* %11, align 4
+  %12 = add nsw <4 x i32> %wide.load19, %9
+  %13 = bitcast i32* %10 to <4 x i32>*
+  store <4 x i32> %12, <4 x i32>* %13, align 4
+  %index.next = add i32 %index, 4
+  %14 = icmp eq i32 %index.next, %n.vec
+  br i1 %14, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
+  ret void
+}
+
+
+define void @sink_add_sub_unsinkable(i32* %s1, i32* %s2, i32 %x, i32* %d, i32* %d2, i32 %n) {
+; CHECK-LABEL: @sink_add_sub_unsinkable(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP13:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP13]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], -4
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT16:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT15]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+;
+entry:
+  %cmp13 = icmp sgt i32 %n, 0
+  br i1 %cmp13, label %vector.ph, label %for.cond.cleanup
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %n.vec = and i32 %n, -4
+  %broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %x, i32 0
+  %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %s1, i32 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+  %2 = mul nsw <4 x i32> %wide.load, %broadcast.splat16
+  %3 = getelementptr inbounds i32, i32* %d, i32 %index
+  %4 = bitcast i32* %3 to <4 x i32>*
+  %wide.load17 = load <4 x i32>, <4 x i32>* %4, align 4
+  %5 = add nsw <4 x i32> %wide.load17, %2
+  %6 = bitcast i32* %3 to <4 x i32>*
+  store <4 x i32> %5, <4 x i32>* %6, align 4
+  %7 = getelementptr inbounds i32, i32* %s2, i32 %index
+  %8 = bitcast i32* %7 to <4 x i32>*
+  %wide.load18 = load <4 x i32>, <4 x i32>* %8, align 4
+  %9 = sub nsw <4 x i32> %broadcast.splat16, %wide.load18
+  %10 = getelementptr inbounds i32, i32* %d2, i32 %index
+  %11 = bitcast i32* %10 to <4 x i32>*
+  %wide.load19 = load <4 x i32>, <4 x i32>* %11, align 4
+  %12 = add nsw <4 x i32> %wide.load19, %9
+  %13 = bitcast i32* %10 to <4 x i32>*
+  store <4 x i32> %12, <4 x i32>* %13, align 4
+  %index.next = add i32 %index, 4
+  %14 = icmp eq i32 %index.next, %n.vec
+  br i1 %14, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
+  ret void
+}
+
+define void @sink_sub(i32* %s1, i32 %x, i32* %d, i32 %n) {
+; CHECK-LABEL: @sink_sub(
+; CHECK:    vector.ph:
+; CHECK-NOT:  [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
+; CHECK-NOT:  [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK:    vector.body:
+; CHECK:      [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
+; CHECK:      [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %vector.ph, label %for.cond.cleanup
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %n.vec = and i32 %n, -4
+  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %x, i32 0
+  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %s1, i32 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+  %2 = sub nsw <4 x i32> %wide.load, %broadcast.splat9
+  %3 = getelementptr inbounds i32, i32* %d, i32 %index
+  %4 = bitcast i32* %3 to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %4, align 4
+  %index.next = add i32 %index, 4
+  %5 = icmp eq i32 %index.next, %n.vec
+  br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
+  ret void
+}
+
+define void @sink_sub_unsinkable(i32* %s1, i32 %x, i32* %d, i32 %n) {
+entry:
+; CHECK-LABEL: @sink_sub_unsinkable(
+; CHECK:      vector.ph:
+; CHECK-NEXT:   [[N_VEC:%.*]] = and i32 [[N]], -4
+; CHECK-NEXT:   [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
+; CHECK-NEXT:   [[BROADCAST_SPLAT16:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT15]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:   br label [[VECTOR_BODY:%.*]]
+; CHECK:      vector.body:
+; CHECK-NOT:    [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
+; CHECK-NOT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
+;
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %vector.ph, label %for.cond.cleanup
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %n.vec = and i32 %n, -4
+  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %x, i32 0
+  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %s1, i32 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+  %2 = sub nsw <4 x i32> %broadcast.splat9, %wide.load
+  %3 = getelementptr inbounds i32, i32* %d, i32 %index
+  %4 = bitcast i32* %3 to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %4, align 4
+  %index.next = add i32 %index, 4
+  %5 = icmp eq i32 %index.next, %n.vec
+  br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
+  ret void
+}
author	Sam Tebbs <sam.tebbs@arm.com>	2019-09-06 16:01:32 +0000
committer	Sam Tebbs <sam.tebbs@arm.com>	2019-09-06 16:01:32 +0000
commit	f1cdd95a2fe79fbcd7fd440509a754bc3afaf088 (patch)
tree	f36f31741b8883c8c19f035cdc26ee3c9f4ee927 /llvm/test/Transforms/CodeGenPrepare
parent	8d30c1dcec2a935e0b1cffc26fdc6054ff101f53 (diff)
download	bcm5719-llvm-f1cdd95a2fe79fbcd7fd440509a754bc3afaf088.tar.gz bcm5719-llvm-f1cdd95a2fe79fbcd7fd440509a754bc3afaf088.zip