diff options
-rw-r--r-- | llvm/lib/Target/ARM/ARMISelLowering.cpp | 30 | ||||
-rw-r--r-- | llvm/lib/Target/ARM/ARMInstrMVE.td | 1 | ||||
-rw-r--r-- | llvm/test/CodeGen/Thumb2/mve-shifts-scalar.ll | 422 | ||||
-rw-r--r-- | llvm/test/CodeGen/Thumb2/mve-shifts.ll | 30 |
4 files changed, 21 insertions, 462 deletions
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 6aada18d555..c08f748c129 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -11814,8 +11814,7 @@ static SDValue PerformADDCombine(SDNode *N, /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. /// static SDValue PerformSUBCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const ARMSubtarget *Subtarget) { + TargetLowering::DAGCombinerInfo &DCI) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -11824,27 +11823,7 @@ static SDValue PerformSUBCombine(SDNode *N, if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI)) return Result; - if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector()) - return SDValue(); - - // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x)) - // so that we can readily pattern match more mve instructions which can use - // a scalar operand. - SDValue VDup = N->getOperand(1); - if (VDup->getOpcode() != ARMISD::VDUP) - return SDValue(); - - SDValue VMov = N->getOperand(0); - if (VMov->getOpcode() == ISD::BITCAST) - VMov = VMov->getOperand(0); - - if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov)) - return SDValue(); - - SDLoc dl(N); - SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32, VMov->getOperand(0), - VDup->getOperand(0)); - return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate); + return SDValue(); } /// PerformVMULCombine @@ -14529,7 +14508,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); - case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget); + case ISD::SUB: return PerformSUBCombine(N, DCI); case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); case ISD::OR: return PerformORCombine(N, DCI, Subtarget); case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); @@ -14887,9 +14866,6 @@ bool ARMTargetLowering::shouldSinkOperands(Instruction *I, case Instruction::Mul: return true; case Instruction::Sub: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: return Operand == 1; default: return false; diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index 35495a121c1..2a063e4ebde 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -4304,7 +4304,6 @@ defm MVE_VRSHL_qr : MVE_VxSHL_qr_types<"vrshl", 0b0, 0b1>; defm MVE_VQSHL_qr : MVE_VxSHL_qr_types<"vqshl", 0b1, 0b0>; defm MVE_VQRSHL_qr : MVE_VxSHL_qr_types<"vqrshl", 0b1, 0b1>; - let Predicates = [HasMVEInt] in { def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))), (v4i32 (MVE_VSHL_qru32 (v4i32 MQPR:$Qm), GPR:$Rm))>; diff --git a/llvm/test/CodeGen/Thumb2/mve-shifts-scalar.ll b/llvm/test/CodeGen/Thumb2/mve-shifts-scalar.ll deleted file mode 100644 index d9ab11ef81c..00000000000 --- a/llvm/test/CodeGen/Thumb2/mve-shifts-scalar.ll +++ /dev/null @@ -1,422 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -O3 -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve %s -o - | FileCheck %s - -define dso_local arm_aapcs_vfpcc void @sink_shl_i32(i32* nocapture readonly %in, i32* noalias nocapture %out, i32 %shift, i32 %N) { -; CHECK-LABEL: sink_shl_i32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: subs r0, #16 -; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: subs r1, #16 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB0_1: @ %vector.body -; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #16]! -; CHECK-NEXT: vshl.u32 q0, r2 -; CHECK-NEXT: vstrb.8 q0, [r1, #16]! -; CHECK-NEXT: le lr, .LBB0_1 -; CHECK-NEXT: @ %bb.2: @ %exit -; CHECK-NEXT: pop {r7, pc} -entry: - br label %vector.ph - -vector.ph: - %n.vec = and i32 %N, -4 - %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %shift, i32 0 - %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer - br label %vector.body - -vector.body: ; preds = %vector.body, %vector.ph - %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %gep.in = getelementptr inbounds i32, i32* %in, i32 %index - %cast.in = bitcast i32* %gep.in to <4 x i32>* - %wide.load = load <4 x i32>, <4 x i32>* %cast.in, align 4 - %res = shl <4 x i32> %wide.load, %broadcast.splat11 - %gep.out = getelementptr inbounds i32, i32* %out, i32 %index - %cast.out = bitcast i32* %gep.out to <4 x i32>* - store <4 x i32> %res, <4 x i32>* %cast.out, align 4 - %index.next = add i32 %index, 4 - %cmp = icmp eq i32 %index.next, %n.vec - br i1 %cmp, label %exit, label %vector.body - -exit: - ret void -} - -define dso_local arm_aapcs_vfpcc void @sink_shl_i16(i16* nocapture readonly %in, i16* noalias nocapture %out, i16 %shift, i32 %N) { -; CHECK-LABEL: sink_shl_i16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: subs r0, #8 -; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: subs r1, #8 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB1_1: @ %vector.body -; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #8]! -; CHECK-NEXT: vshl.u16 q0, r2 -; CHECK-NEXT: vstrb.8 q0, [r1, #8]! -; CHECK-NEXT: le lr, .LBB1_1 -; CHECK-NEXT: @ %bb.2: @ %exit -; CHECK-NEXT: pop {r7, pc} -entry: - br label %vector.ph - -vector.ph: - %n.vec = and i32 %N, -4 - %broadcast.splatinsert10 = insertelement <8 x i16> undef, i16 %shift, i32 0 - %broadcast.splat11 = shufflevector <8 x i16> %broadcast.splatinsert10, <8 x i16> undef, <8 x i32> zeroinitializer - br label %vector.body - -vector.body: ; preds = %vector.body, %vector.ph - %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %gep.in = getelementptr inbounds i16, i16* %in, i32 %index - %cast.in = bitcast i16* %gep.in to <8 x i16>* - %wide.load = load <8 x i16>, <8 x i16>* %cast.in, align 4 - %res = shl <8 x i16> %wide.load, %broadcast.splat11 - %gep.out = getelementptr inbounds i16, i16* %out, i32 %index - %cast.out = bitcast i16* %gep.out to <8 x i16>* - store <8 x i16> %res, <8 x i16>* %cast.out, align 4 - %index.next = add i32 %index, 4 - %cmp = icmp eq i32 %index.next, %n.vec - br i1 %cmp, label %exit, label %vector.body - -exit: - ret void -} - -define dso_local arm_aapcs_vfpcc void @sink_shl_i8(i8* nocapture readonly %in, i8* noalias nocapture %out, i8 %shift, i32 %N) { -; CHECK-LABEL: sink_shl_i8: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: subs r0, #4 -; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB2_1: @ %vector.body -; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #4]! -; CHECK-NEXT: vshl.u8 q0, r2 -; CHECK-NEXT: vstrb.8 q0, [r1, #4]! -; CHECK-NEXT: le lr, .LBB2_1 -; CHECK-NEXT: @ %bb.2: @ %exit -; CHECK-NEXT: pop {r7, pc} -entry: - br label %vector.ph - -vector.ph: - %n.vec = and i32 %N, -4 - %broadcast.splatinsert10 = insertelement <16 x i8> undef, i8 %shift, i32 0 - %broadcast.splat11 = shufflevector <16 x i8> %broadcast.splatinsert10, <16 x i8> undef, <16 x i32> zeroinitializer - br label %vector.body - -vector.body: ; preds = %vector.body, %vector.ph - %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %gep.in = getelementptr inbounds i8, i8* %in, i32 %index - %cast.in = bitcast i8* %gep.in to <16 x i8>* - %wide.load = load <16 x i8>, <16 x i8>* %cast.in, align 4 - %res = shl <16 x i8> %wide.load, %broadcast.splat11 - %gep.out = getelementptr inbounds i8, i8* %out, i32 %index - %cast.out = bitcast i8* %gep.out to <16 x i8>* - store <16 x i8> %res, <16 x i8>* %cast.out, align 4 - %index.next = add i32 %index, 4 - %cmp = icmp eq i32 %index.next, %n.vec - br i1 %cmp, label %exit, label %vector.body - -exit: - ret void -} - -define dso_local arm_aapcs_vfpcc void @sink_lshr_i32(i32* nocapture readonly %in, i32* noalias nocapture %out, i32 %shift, i32 %N) { -; CHECK-LABEL: sink_lshr_i32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: subs r0, #16 -; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: subs r1, #16 -; CHECK-NEXT: subs r2, #0, r2 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB3_1: @ %vector.body -; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #16]! -; CHECK-NEXT: vshl.u32 q0, r2 -; CHECK-NEXT: vstrb.8 q0, [r1, #16]! -; CHECK-NEXT: le lr, .LBB3_1 -; CHECK-NEXT: @ %bb.2: @ %exit -; CHECK-NEXT: pop {r7, pc} -entry: - br label %vector.ph - -vector.ph: - %n.vec = and i32 %N, -4 - %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %shift, i32 0 - %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer - br label %vector.body - -vector.body: ; preds = %vector.body, %vector.ph - %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %gep.in = getelementptr inbounds i32, i32* %in, i32 %index - %cast.in = bitcast i32* %gep.in to <4 x i32>* - %wide.load = load <4 x i32>, <4 x i32>* %cast.in, align 4 - %res = lshr <4 x i32> %wide.load, %broadcast.splat11 - %gep.out = getelementptr inbounds i32, i32* %out, i32 %index - %cast.out = bitcast i32* %gep.out to <4 x i32>* - store <4 x i32> %res, <4 x i32>* %cast.out, align 4 - %index.next = add i32 %index, 4 - %cmp = icmp eq i32 %index.next, %n.vec - br i1 %cmp, label %exit, label %vector.body - -exit: - ret void -} - -define dso_local arm_aapcs_vfpcc void @sink_lshr_i16(i16* nocapture readonly %in, i16* noalias nocapture %out, i16 %shift, i32 %N) { -; CHECK-LABEL: sink_lshr_i16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: subs r0, #8 -; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: subs r1, #8 -; CHECK-NEXT: subs r2, #0, r2 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB4_1: @ %vector.body -; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #8]! -; CHECK-NEXT: vshl.u16 q0, r2 -; CHECK-NEXT: vstrb.8 q0, [r1, #8]! -; CHECK-NEXT: le lr, .LBB4_1 -; CHECK-NEXT: @ %bb.2: @ %exit -; CHECK-NEXT: pop {r7, pc} -entry: - br label %vector.ph - -vector.ph: - %n.vec = and i32 %N, -4 - %broadcast.splatinsert10 = insertelement <8 x i16> undef, i16 %shift, i32 0 - %broadcast.splat11 = shufflevector <8 x i16> %broadcast.splatinsert10, <8 x i16> undef, <8 x i32> zeroinitializer - br label %vector.body - -vector.body: ; preds = %vector.body, %vector.ph - %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %gep.in = getelementptr inbounds i16, i16* %in, i32 %index - %cast.in = bitcast i16* %gep.in to <8 x i16>* - %wide.load = load <8 x i16>, <8 x i16>* %cast.in, align 4 - %res = lshr <8 x i16> %wide.load, %broadcast.splat11 - %gep.out = getelementptr inbounds i16, i16* %out, i32 %index - %cast.out = bitcast i16* %gep.out to <8 x i16>* - store <8 x i16> %res, <8 x i16>* %cast.out, align 4 - %index.next = add i32 %index, 4 - %cmp = icmp eq i32 %index.next, %n.vec - br i1 %cmp, label %exit, label %vector.body - -exit: - ret void -} - -define dso_local arm_aapcs_vfpcc void @sink_lshr_i8(i8* nocapture readonly %in, i8* noalias nocapture %out, i8 %shift, i32 %N) { -; CHECK-LABEL: sink_lshr_i8: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: subs r0, #4 -; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: subs r2, #0, r2 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB5_1: @ %vector.body -; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #4]! -; CHECK-NEXT: vshl.u8 q0, r2 -; CHECK-NEXT: vstrb.8 q0, [r1, #4]! -; CHECK-NEXT: le lr, .LBB5_1 -; CHECK-NEXT: @ %bb.2: @ %exit -; CHECK-NEXT: pop {r7, pc} -entry: - br label %vector.ph - -vector.ph: - %n.vec = and i32 %N, -4 - %broadcast.splatinsert10 = insertelement <16 x i8> undef, i8 %shift, i32 0 - %broadcast.splat11 = shufflevector <16 x i8> %broadcast.splatinsert10, <16 x i8> undef, <16 x i32> zeroinitializer - br label %vector.body - -vector.body: ; preds = %vector.body, %vector.ph - %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %gep.in = getelementptr inbounds i8, i8* %in, i32 %index - %cast.in = bitcast i8* %gep.in to <16 x i8>* - %wide.load = load <16 x i8>, <16 x i8>* %cast.in, align 4 - %res = lshr <16 x i8> %wide.load, %broadcast.splat11 - %gep.out = getelementptr inbounds i8, i8* %out, i32 %index - %cast.out = bitcast i8* %gep.out to <16 x i8>* - store <16 x i8> %res, <16 x i8>* %cast.out, align 4 - %index.next = add i32 %index, 4 - %cmp = icmp eq i32 %index.next, %n.vec - br i1 %cmp, label %exit, label %vector.body - -exit: - ret void -} - -define dso_local arm_aapcs_vfpcc void @sink_ashr_i32(i32* nocapture readonly %in, i32* noalias nocapture %out, i32 %shift, i32 %N) { -; CHECK-LABEL: sink_ashr_i32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: subs r0, #16 -; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: subs r1, #16 -; CHECK-NEXT: subs r2, #0, r2 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB6_1: @ %vector.body -; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #16]! -; CHECK-NEXT: vshl.s32 q0, r2 -; CHECK-NEXT: vstrb.8 q0, [r1, #16]! -; CHECK-NEXT: le lr, .LBB6_1 -; CHECK-NEXT: @ %bb.2: @ %exit -; CHECK-NEXT: pop {r7, pc} -entry: - br label %vector.ph - -vector.ph: - %n.vec = and i32 %N, -4 - %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %shift, i32 0 - %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer - br label %vector.body - -vector.body: ; preds = %vector.body, %vector.ph - %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %gep.in = getelementptr inbounds i32, i32* %in, i32 %index - %cast.in = bitcast i32* %gep.in to <4 x i32>* - %wide.load = load <4 x i32>, <4 x i32>* %cast.in, align 4 - %res = ashr <4 x i32> %wide.load, %broadcast.splat11 - %gep.out = getelementptr inbounds i32, i32* %out, i32 %index - %cast.out = bitcast i32* %gep.out to <4 x i32>* - store <4 x i32> %res, <4 x i32>* %cast.out, align 4 - %index.next = add i32 %index, 4 - %cmp = icmp eq i32 %index.next, %n.vec - br i1 %cmp, label %exit, label %vector.body - -exit: - ret void -} - -define dso_local arm_aapcs_vfpcc void @sink_ashr_i16(i16* nocapture readonly %in, i16* noalias nocapture %out, i16 %shift, i32 %N) { -; CHECK-LABEL: sink_ashr_i16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: subs r0, #8 -; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: subs r1, #8 -; CHECK-NEXT: subs r2, #0, r2 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB7_1: @ %vector.body -; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #8]! -; CHECK-NEXT: vshl.s16 q0, r2 -; CHECK-NEXT: vstrb.8 q0, [r1, #8]! -; CHECK-NEXT: le lr, .LBB7_1 -; CHECK-NEXT: @ %bb.2: @ %exit -; CHECK-NEXT: pop {r7, pc} -entry: - br label %vector.ph - -vector.ph: - %n.vec = and i32 %N, -4 - %broadcast.splatinsert10 = insertelement <8 x i16> undef, i16 %shift, i32 0 - %broadcast.splat11 = shufflevector <8 x i16> %broadcast.splatinsert10, <8 x i16> undef, <8 x i32> zeroinitializer - br label %vector.body - -vector.body: ; preds = %vector.body, %vector.ph - %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %gep.in = getelementptr inbounds i16, i16* %in, i32 %index - %cast.in = bitcast i16* %gep.in to <8 x i16>* - %wide.load = load <8 x i16>, <8 x i16>* %cast.in, align 4 - %res = ashr <8 x i16> %wide.load, %broadcast.splat11 - %gep.out = getelementptr inbounds i16, i16* %out, i32 %index - %cast.out = bitcast i16* %gep.out to <8 x i16>* - store <8 x i16> %res, <8 x i16>* %cast.out, align 4 - %index.next = add i32 %index, 4 - %cmp = icmp eq i32 %index.next, %n.vec - br i1 %cmp, label %exit, label %vector.body - -exit: - ret void -} - -define dso_local arm_aapcs_vfpcc void @sink_ashr_i8(i8* nocapture readonly %in, i8* noalias nocapture %out, i8 %shift, i32 %N) { -; CHECK-LABEL: sink_ashr_i8: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: subs r0, #4 -; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: subs r2, #0, r2 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB8_1: @ %vector.body -; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0, #4]! -; CHECK-NEXT: vshl.s8 q0, r2 -; CHECK-NEXT: vstrb.8 q0, [r1, #4]! -; CHECK-NEXT: le lr, .LBB8_1 -; CHECK-NEXT: @ %bb.2: @ %exit -; CHECK-NEXT: pop {r7, pc} -entry: - br label %vector.ph - -vector.ph: - %n.vec = and i32 %N, -4 - %broadcast.splatinsert10 = insertelement <16 x i8> undef, i8 %shift, i32 0 - %broadcast.splat11 = shufflevector <16 x i8> %broadcast.splatinsert10, <16 x i8> undef, <16 x i32> zeroinitializer - br label %vector.body - -vector.body: ; preds = %vector.body, %vector.ph - %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %gep.in = getelementptr inbounds i8, i8* %in, i32 %index - %cast.in = bitcast i8* %gep.in to <16 x i8>* - %wide.load = load <16 x i8>, <16 x i8>* %cast.in, align 4 - %res = ashr <16 x i8> %wide.load, %broadcast.splat11 - %gep.out = getelementptr inbounds i8, i8* %out, i32 %index - %cast.out = bitcast i8* %gep.out to <16 x i8>* - store <16 x i8> %res, <16 x i8>* %cast.out, align 4 - %index.next = add i32 %index, 4 - %cmp = icmp eq i32 %index.next, %n.vec - br i1 %cmp, label %exit, label %vector.body - -exit: - ret void -} diff --git a/llvm/test/CodeGen/Thumb2/mve-shifts.ll b/llvm/test/CodeGen/Thumb2/mve-shifts.ll index 688fd455d78..a321c2dd383 100644 --- a/llvm/test/CodeGen/Thumb2/mve-shifts.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shifts.ll @@ -383,8 +383,9 @@ entry: define arm_aapcs_vfpcc <16 x i8> @shru_qr_int8_t(<16 x i8> %src1, i8 %src2) { ; CHECK-LABEL: shru_qr_int8_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: subs r0, #0, r0 -; CHECK-NEXT: vshl.u8 q0, r0 +; CHECK-NEXT: vdup.8 q1, r0 +; CHECK-NEXT: vneg.s8 q1, q1 +; CHECK-NEXT: vshl.u8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %i = insertelement <16 x i8> undef, i8 %src2, i32 0 @@ -396,8 +397,9 @@ entry: define arm_aapcs_vfpcc <8 x i16> @shru_qr_int16_t(<8 x i16> %src1, i16 %src2) { ; CHECK-LABEL: shru_qr_int16_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: subs r0, #0, r0 -; CHECK-NEXT: vshl.u16 q0, r0 +; CHECK-NEXT: vdup.16 q1, r0 +; CHECK-NEXT: vneg.s16 q1, q1 +; CHECK-NEXT: vshl.u16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %i = insertelement <8 x i16> undef, i16 %src2, i32 0 @@ -409,8 +411,9 @@ entry: define arm_aapcs_vfpcc <4 x i32> @shru_qr_int32_t(<4 x i32> %src1, i32 %src2) { ; CHECK-LABEL: shru_qr_int32_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: subs r0, #0, r0 -; CHECK-NEXT: vshl.u32 q0, r0 +; CHECK-NEXT: vdup.32 q1, r0 +; CHECK-NEXT: vneg.s32 q1, q1 +; CHECK-NEXT: vshl.u32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %i = insertelement <4 x i32> undef, i32 %src2, i32 0 @@ -446,8 +449,9 @@ entry: define arm_aapcs_vfpcc <16 x i8> @shrs_qr_int8_t(<16 x i8> %src1, i8 %src2) { ; CHECK-LABEL: shrs_qr_int8_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: subs r0, #0, r0 -; CHECK-NEXT: vshl.s8 q0, r0 +; CHECK-NEXT: vdup.8 q1, r0 +; CHECK-NEXT: vneg.s8 q1, q1 +; CHECK-NEXT: vshl.s8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %i = insertelement <16 x i8> undef, i8 %src2, i32 0 @@ -459,8 +463,9 @@ entry: define arm_aapcs_vfpcc <8 x i16> @shrs_qr_int16_t(<8 x i16> %src1, i16 %src2) { ; CHECK-LABEL: shrs_qr_int16_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: subs r0, #0, r0 -; CHECK-NEXT: vshl.s16 q0, r0 +; CHECK-NEXT: vdup.16 q1, r0 +; CHECK-NEXT: vneg.s16 q1, q1 +; CHECK-NEXT: vshl.s16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %i = insertelement <8 x i16> undef, i16 %src2, i32 0 @@ -472,8 +477,9 @@ entry: define arm_aapcs_vfpcc <4 x i32> @shrs_qr_int32_t(<4 x i32> %src1, i32 %src2) { ; CHECK-LABEL: shrs_qr_int32_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: subs r0, #0, r0 -; CHECK-NEXT: vshl.s32 q0, r0 +; CHECK-NEXT: vdup.32 q1, r0 +; CHECK-NEXT: vneg.s32 q1, q1 +; CHECK-NEXT: vshl.s32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %i = insertelement <4 x i32> undef, i32 %src2, i32 0 |