summary | refs | log | tree | commit | diff | stats
path: root/llvm
diff options
context:
space:
mode:
author: Sam Tebbs <sam.tebbs@arm.com> 2019-09-06 16:01:32 +0000
committer: Sam Tebbs <sam.tebbs@arm.com> 2019-09-06 16:01:32 +0000
commit: f1cdd95a2fe79fbcd7fd440509a754bc3afaf088 (patch)
tree: f36f31741b8883c8c19f035cdc26ee3c9f4ee927 /llvm
parent: 8d30c1dcec2a935e0b1cffc26fdc6054ff101f53 (diff)
downloadbcm5719-llvm-f1cdd95a2fe79fbcd7fd440509a754bc3afaf088.tar.gz
bcm5719-llvm-f1cdd95a2fe79fbcd7fd440509a754bc3afaf088.zip
[ARM] Sink add/mul(shufflevector(insertelement())) for MVE instruction selection
This patch sinks add/mul(shufflevector(insertelement())) into the basic block in which they are used so that they can then be selected together. This is useful for various MVE instructions, such as vmla and others that take R registers. Loop tests have been added to the vmla test file to make sure vmlas are generated in loops.

Differential revision: https://reviews.llvm.org/D66295

llvm-svn: 371218
Diffstat (limited to 'llvm')
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.cpp58
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-vmla.ll122
-rw-r--r--llvm/test/Transforms/CodeGenPrepare/ARM/sink-add-mul-shufflevector.ll216
3 files changed, 386 insertions, 10 deletions
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index eacaa75bc10..24b7ebd2600 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -14340,22 +14340,60 @@ static bool areExtractExts(Value *Ext1, Value *Ext2) {
/// sext/zext can be folded into vsubl.
bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
SmallVectorImpl<Use *> &Ops) const {
- if (!Subtarget->hasNEON() || !I->getType()->isVectorTy())
+ if (!I->getType()->isVectorTy())
return false;
- switch (I->getOpcode()) {
- case Instruction::Sub:
- case Instruction::Add: {
- if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
+ if (Subtarget->hasNEON()) {
+ switch (I->getOpcode()) {
+ case Instruction::Sub:
+ case Instruction::Add: {
+ if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
+ return false;
+ Ops.push_back(&I->getOperandUse(0));
+ Ops.push_back(&I->getOperandUse(1));
+ return true;
+ }
+ default:
return false;
- Ops.push_back(&I->getOperandUse(0));
- Ops.push_back(&I->getOperandUse(1));
- return true;
+ }
}
- default:
+
+ if (!Subtarget->hasMVEIntegerOps())
+ return false;
+
+ auto IsSinker = [](Instruction *I, int Operand) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Mul:
+ return true;
+ case Instruction::Sub:
+ return Operand == 1;
+ default:
+ return false;
+ }
+ };
+
+ int Op = 0;
+ if (!isa<ShuffleVectorInst>(I->getOperand(Op)))
+ Op = 1;
+ if (!IsSinker(I, Op))
+ return false;
+ if (!match(I->getOperand(Op),
+ m_ShuffleVector(m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()),
+ m_Undef(), m_Zero()))) {
return false;
}
- return false;
+ Instruction *Shuffle = cast<Instruction>(I->getOperand(Op));
+ // All uses of the shuffle should be sunk to avoid duplicating it across gpr
+ // and vector registers
+ for (Use &U : Shuffle->uses()) {
+ Instruction *Insn = cast<Instruction>(U.getUser());
+ if (!IsSinker(Insn, U.getOperandNo()))
+ return false;
+ }
+ Ops.push_back(&Shuffle->getOperandUse(0));
+ Ops.push_back(&I->getOperandUse(Op));
+ return true;
}
bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmla.ll b/llvm/test/CodeGen/Thumb2/mve-vmla.ll
index 17f37ab6e55..b45efdf2edd 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmla.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmla.ll
@@ -78,3 +78,125 @@ entry:
ret <16 x i8> %3
}
+define void @vmla32_in_loop(i32* %s1, i32 %x, i32* %d, i32 %n) {
+; CHECK-LABEL: vmla32_in_loop:
+; CHECK: .LBB6_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q0, [r0, #16]!
+; CHECK-NEXT: vldrw.u32 q1, [r2, #16]!
+; CHECK-NEXT: vmla.u32 q1, q0, r1
+; CHECK-NEXT: vstrw.32 q1, [r2]
+; CHECK-NEXT: le lr, .LBB6_1
+; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %vector.ph, label %for.cond.cleanup
+
+vector.ph: ; preds = %for.body.preheader
+ %n.vec = and i32 %n, -4
+ %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %x, i32 0
+ %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %0 = getelementptr inbounds i32, i32* %s1, i32 %index
+ %1 = bitcast i32* %0 to <4 x i32>*
+ %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+ %2 = mul nsw <4 x i32> %wide.load, %broadcast.splat9
+ %3 = getelementptr inbounds i32, i32* %d, i32 %index
+ %4 = bitcast i32* %3 to <4 x i32>*
+ %wide.load10 = load <4 x i32>, <4 x i32>* %4, align 4
+ %5 = add nsw <4 x i32> %wide.load10, %2
+ %6 = bitcast i32* %3 to <4 x i32>*
+ store <4 x i32> %5, <4 x i32>* %6, align 4
+ %index.next = add i32 %index, 4
+ %7 = icmp eq i32 %index.next, %n.vec
+ br i1 %7, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
+ ret void
+}
+
+define void @vmla16_in_loop(i16* %s1, i16 %x, i16* %d, i32 %n) {
+; CHECK-LABEL: vmla16_in_loop:
+; CHECK: .LBB7_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrh.u16 q0, [r0, #16]!
+; CHECK-NEXT: vldrh.u16 q1, [r2, #16]!
+; CHECK-NEXT: vmla.u16 q1, q0, r1
+; CHECK-NEXT: vstrh.16 q1, [r2]
+; CHECK-NEXT: le lr, .LBB7_1
+; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %vector.ph, label %for.cond.cleanup
+
+vector.ph: ; preds = %for.body.preheader
+ %n.vec = and i32 %n, -8
+ %broadcast.splatinsert11 = insertelement <8 x i16> undef, i16 %x, i32 0
+ %broadcast.splat12 = shufflevector <8 x i16> %broadcast.splatinsert11, <8 x i16> undef, <8 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %0 = getelementptr inbounds i16, i16* %s1, i32 %index
+ %1 = bitcast i16* %0 to <8 x i16>*
+ %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
+ %2 = mul <8 x i16> %wide.load, %broadcast.splat12
+ %3 = getelementptr inbounds i16, i16* %d, i32 %index
+ %4 = bitcast i16* %3 to <8 x i16>*
+ %wide.load13 = load <8 x i16>, <8 x i16>* %4, align 2
+ %5 = add <8 x i16> %2, %wide.load13
+ %6 = bitcast i16* %3 to <8 x i16>*
+ store <8 x i16> %5, <8 x i16>* %6, align 2
+ %index.next = add i32 %index, 8
+ %7 = icmp eq i32 %index.next, %n.vec
+ br i1 %7, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
+ ret void
+}
+
+define void @vmla8_in_loop(i8* %s1, i8 %x, i8* %d, i32 %n) {
+; CHECK-LABEL: vmla8_in_loop:
+; CHECK: .LBB8_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrh.u16 q0, [r0, #8]!
+; CHECK-NEXT: vldrh.u16 q1, [r2, #8]!
+; CHECK-NEXT: vmla.u8 q1, q0, r1
+; CHECK-NEXT: vstrh.16 q1, [r2]
+; CHECK-NEXT: le lr, .LBB8_1
+; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %vector.ph, label %for.cond.cleanup
+
+vector.ph: ; preds = %for.body.preheader
+ %n.vec = and i32 %n, -8
+ %broadcast.splatinsert11 = insertelement <16 x i8> undef, i8 %x, i32 0
+ %broadcast.splat12 = shufflevector <16 x i8> %broadcast.splatinsert11, <16 x i8> undef, <16 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %0 = getelementptr inbounds i8, i8* %s1, i32 %index
+ %1 = bitcast i8* %0 to <16 x i8>*
+ %wide.load = load <16 x i8>, <16 x i8>* %1, align 2
+ %2 = mul <16 x i8> %wide.load, %broadcast.splat12
+ %3 = getelementptr inbounds i8, i8* %d, i32 %index
+ %4 = bitcast i8* %3 to <16 x i8>*
+ %wide.load13 = load <16 x i8>, <16 x i8>* %4, align 2
+ %5 = add <16 x i8> %2, %wide.load13
+ %6 = bitcast i8* %3 to <16 x i8>*
+ store <16 x i8> %5, <16 x i8>* %6, align 2
+ %index.next = add i32 %index, 8
+ %7 = icmp eq i32 %index.next, %n.vec
+ br i1 %7, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
+ ret void
+}
diff --git a/llvm/test/Transforms/CodeGenPrepare/ARM/sink-add-mul-shufflevector.ll b/llvm/test/Transforms/CodeGenPrepare/ARM/sink-add-mul-shufflevector.ll
new file mode 100644
index 00000000000..739877b3f0c
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/ARM/sink-add-mul-shufflevector.ll
@@ -0,0 +1,216 @@
+; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp < %s -codegenprepare -S | FileCheck -check-prefix=CHECK %s
+
+define void @sink_add_mul(i32* %s1, i32 %x, i32* %d, i32 %n) {
+; CHECK-LABEL: @sink_add_mul(
+; CHECK: vector.ph:
+; CHECK-NOT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
+; CHECK-NOT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: vector.body:
+; CHECK: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
+; CHECK: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
+;
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %vector.ph, label %for.cond.cleanup
+
+vector.ph: ; preds = %for.body.preheader
+ %n.vec = and i32 %n, -4
+ %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %x, i32 0
+ %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %0 = getelementptr inbounds i32, i32* %s1, i32 %index
+ %1 = bitcast i32* %0 to <4 x i32>*
+ %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+ %2 = mul nsw <4 x i32> %wide.load, %broadcast.splat9
+ %3 = getelementptr inbounds i32, i32* %d, i32 %index
+ %4 = bitcast i32* %3 to <4 x i32>*
+ %wide.load10 = load <4 x i32>, <4 x i32>* %4, align 4
+ %5 = add nsw <4 x i32> %wide.load10, %2
+ %6 = bitcast i32* %3 to <4 x i32>*
+ store <4 x i32> %5, <4 x i32>* %6, align 4
+ %index.next = add i32 %index, 4
+ %7 = icmp eq i32 %index.next, %n.vec
+ br i1 %7, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
+ ret void
+}
+
+define void @sink_add_mul_multiple(i32* %s1, i32* %s2, i32 %x, i32* %d, i32* %d2, i32 %n) {
+; CHECK-LABEL: @sink_add_mul_multiple(
+; CHECK: vector.ph:
+; CHECK-NOT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
+; CHECK-NOT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: vector.body:
+; CHECK: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
+; CHECK: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
+;
+entry:
+ %cmp13 = icmp sgt i32 %n, 0
+ br i1 %cmp13, label %vector.ph, label %for.cond.cleanup
+
+vector.ph: ; preds = %for.body.preheader
+ %n.vec = and i32 %n, -4
+ %broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %x, i32 0
+ %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %0 = getelementptr inbounds i32, i32* %s1, i32 %index
+ %1 = bitcast i32* %0 to <4 x i32>*
+ %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+ %2 = mul nsw <4 x i32> %wide.load, %broadcast.splat16
+ %3 = getelementptr inbounds i32, i32* %d, i32 %index
+ %4 = bitcast i32* %3 to <4 x i32>*
+ %wide.load17 = load <4 x i32>, <4 x i32>* %4, align 4
+ %5 = add nsw <4 x i32> %wide.load17, %2
+ %6 = bitcast i32* %3 to <4 x i32>*
+ store <4 x i32> %5, <4 x i32>* %6, align 4
+ %7 = getelementptr inbounds i32, i32* %s2, i32 %index
+ %8 = bitcast i32* %7 to <4 x i32>*
+ %wide.load18 = load <4 x i32>, <4 x i32>* %8, align 4
+ %9 = mul nsw <4 x i32> %wide.load18, %broadcast.splat16
+ %10 = getelementptr inbounds i32, i32* %d2, i32 %index
+ %11 = bitcast i32* %10 to <4 x i32>*
+ %wide.load19 = load <4 x i32>, <4 x i32>* %11, align 4
+ %12 = add nsw <4 x i32> %wide.load19, %9
+ %13 = bitcast i32* %10 to <4 x i32>*
+ store <4 x i32> %12, <4 x i32>* %13, align 4
+ %index.next = add i32 %index, 4
+ %14 = icmp eq i32 %index.next, %n.vec
+ br i1 %14, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
+ ret void
+}
+
+
+define void @sink_add_sub_unsinkable(i32* %s1, i32* %s2, i32 %x, i32* %d, i32* %d2, i32 %n) {
+; CHECK-LABEL: @sink_add_sub_unsinkable(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP13]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT15]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+;
+entry:
+ %cmp13 = icmp sgt i32 %n, 0
+ br i1 %cmp13, label %vector.ph, label %for.cond.cleanup
+
+vector.ph: ; preds = %for.body.preheader
+ %n.vec = and i32 %n, -4
+ %broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %x, i32 0
+ %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %0 = getelementptr inbounds i32, i32* %s1, i32 %index
+ %1 = bitcast i32* %0 to <4 x i32>*
+ %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+ %2 = mul nsw <4 x i32> %wide.load, %broadcast.splat16
+ %3 = getelementptr inbounds i32, i32* %d, i32 %index
+ %4 = bitcast i32* %3 to <4 x i32>*
+ %wide.load17 = load <4 x i32>, <4 x i32>* %4, align 4
+ %5 = add nsw <4 x i32> %wide.load17, %2
+ %6 = bitcast i32* %3 to <4 x i32>*
+ store <4 x i32> %5, <4 x i32>* %6, align 4
+ %7 = getelementptr inbounds i32, i32* %s2, i32 %index
+ %8 = bitcast i32* %7 to <4 x i32>*
+ %wide.load18 = load <4 x i32>, <4 x i32>* %8, align 4
+ %9 = sub nsw <4 x i32> %broadcast.splat16, %wide.load18
+ %10 = getelementptr inbounds i32, i32* %d2, i32 %index
+ %11 = bitcast i32* %10 to <4 x i32>*
+ %wide.load19 = load <4 x i32>, <4 x i32>* %11, align 4
+ %12 = add nsw <4 x i32> %wide.load19, %9
+ %13 = bitcast i32* %10 to <4 x i32>*
+ store <4 x i32> %12, <4 x i32>* %13, align 4
+ %index.next = add i32 %index, 4
+ %14 = icmp eq i32 %index.next, %n.vec
+ br i1 %14, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
+ ret void
+}
+
+define void @sink_sub(i32* %s1, i32 %x, i32* %d, i32 %n) {
+; CHECK-LABEL: @sink_sub(
+; CHECK: vector.ph:
+; CHECK-NOT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
+; CHECK-NOT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: vector.body:
+; CHECK: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
+; CHECK: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
+;
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %vector.ph, label %for.cond.cleanup
+
+vector.ph: ; preds = %for.body.preheader
+ %n.vec = and i32 %n, -4
+ %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %x, i32 0
+ %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %0 = getelementptr inbounds i32, i32* %s1, i32 %index
+ %1 = bitcast i32* %0 to <4 x i32>*
+ %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+ %2 = sub nsw <4 x i32> %wide.load, %broadcast.splat9
+ %3 = getelementptr inbounds i32, i32* %d, i32 %index
+ %4 = bitcast i32* %3 to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %4, align 4
+ %index.next = add i32 %index, 4
+ %5 = icmp eq i32 %index.next, %n.vec
+ br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
+ ret void
+}
+
+define void @sink_sub_unsinkable(i32* %s1, i32 %x, i32* %d, i32 %n) {
+entry:
+; CHECK-LABEL: @sink_sub_unsinkable(
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT15]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NOT: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
+; CHECK-NOT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
+;
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %vector.ph, label %for.cond.cleanup
+
+vector.ph: ; preds = %for.body.preheader
+ %n.vec = and i32 %n, -4
+ %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %x, i32 0
+ %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %0 = getelementptr inbounds i32, i32* %s1, i32 %index
+ %1 = bitcast i32* %0 to <4 x i32>*
+ %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+ %2 = sub nsw <4 x i32> %broadcast.splat9, %wide.load
+ %3 = getelementptr inbounds i32, i32* %d, i32 %index
+ %4 = bitcast i32* %3 to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %4, align 4
+ %index.next = add i32 %index, 4
+ %5 = icmp eq i32 %index.next, %n.vec
+ br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
+ ret void
+}
OpenPOWER on IntegriCloud