diff options
Diffstat (limited to 'llvm')
| -rw-r--r-- | llvm/lib/Target/ARM/ARMISelLowering.cpp | 55 | ||||
| -rw-r--r-- | llvm/test/CodeGen/ARM/combine-vmovdrr.ll | 72 |
2 files changed, 127 insertions, 0 deletions
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 33f74a3ba9f..23f7bd0f4c8 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -4139,6 +4139,56 @@ static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results, Results.push_back(Read.getOperand(0)); } +/// \p BC is a bitcast that is about to be turned into a VMOVDRR. +/// When \p DstVT, the destination type of \p BC, is on the vector +/// register bank and the source of the bitcast, \p Op, operates on the same bank, +/// it might be possible to combine them, such that everything stays on the +/// vector register bank. +/// \returns The node that would replace \p BC, if the combine +/// is possible. +static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, + SelectionDAG &DAG) { + SDValue Op = BC->getOperand(0); + EVT DstVT = BC->getValueType(0); + + // The only vector instruction that can produce a scalar (remember, + // since the bitcast was about to be turned into VMOVDRR, the source + // type is i64) from a vector is EXTRACT_VECTOR_ELT. + // Moreover, we can do this combine only if there is one use. + // Finally, if the destination type is not a vector, there is not + // much point in forcing everything on the vector bank. + if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !Op.hasOneUse()) + return SDValue(); + + // If the index is not constant, we will introduce an additional + // multiply that will stick. + // Give up in that case. + ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (!Index) + return SDValue(); + unsigned DstNumElt = DstVT.getVectorNumElements(); + + // Compute the new index. + const APInt &APIntIndex = Index->getAPIntValue(); + APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt); + NewIndex *= APIntIndex; + // Check that the new constant index fits into i32 (only its bit width is tested). 
+ if (NewIndex.getBitWidth() > 32) + return SDValue(); + + // vMTy bitcast(i64 extractelt vNi64 src, i32 index) -> + // vMTy extractsubvector(vNxMTy (bitcast vNi64 src), i32 index*M) + SDLoc dl(Op); + SDValue ExtractSrc = Op.getOperand(0); + EVT VecVT = EVT::getVectorVT( + *DAG.getContext(), DstVT.getScalarType(), + ExtractSrc.getValueType().getVectorNumElements() * DstNumElt); + SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast, + DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32)); +} + /// ExpandBITCAST - If the target supports VFP, this function is called to /// expand a bit convert where either the source or destination type is i64 to /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 @@ -4158,6 +4208,11 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { // Turn i64->f64 into VMOVDRR. if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { + // Do not force values to GPRs (this is what VMOVDRR does for the inputs) + // if we can combine the bitcast with its source. + if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG)) + return Val; + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, DAG.getConstant(0, dl, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, diff --git a/llvm/test/CodeGen/ARM/combine-vmovdrr.ll b/llvm/test/CodeGen/ARM/combine-vmovdrr.ll new file mode 100644 index 00000000000..358f7e3a983 --- /dev/null +++ b/llvm/test/CodeGen/ARM/combine-vmovdrr.ll @@ -0,0 +1,72 @@ +; RUN: llc %s -o - | FileCheck %s + +target triple = "thumbv7s-apple-ios" + +declare <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %shuffle.i.i307, <8 x i8> %shuffle.i27.i308, <8 x i8> %vtbl2.i25.i) + +; Check that we get the motivating example: +; The bitcasts force the values to go through the GPRs, whereas +; they are defined on VPRs and used on VPRs. 
+; +; CHECK-LABEL: motivatingExample: +; CHECK: vldr [[ARG2_VAL:d[0-9]+]], [r1] +; CHECK-NEXT: vld1.32 {[[ARG1_VALlo:d[0-9]+]], [[ARG1_VALhi:d[0-9]+]]}, [r0] +; CHECK-NEXT: vtbl.8 [[RES:d[0-9]+]], {[[ARG1_VALlo]], [[ARG1_VALhi]]}, [[ARG2_VAL]] +; CHECK-NEXT: vstr [[RES]], [r1] +; CHECK-NEXT: bx lr +define void @motivatingExample(<2 x i64>* %addr, <8 x i8>* %addr2) { + %shuffle.i.bc.i309 = load <2 x i64>, <2 x i64>* %addr + %vtbl2.i25.i = load <8 x i8>, <8 x i8>* %addr2 + %shuffle.i.extract.i310 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 0 + %shuffle.i27.extract.i311 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 1 + %tmp45 = bitcast i64 %shuffle.i.extract.i310 to <8 x i8> + %tmp46 = bitcast i64 %shuffle.i27.extract.i311 to <8 x i8> + %vtbl2.i25.i313 = tail call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %tmp45, <8 x i8> %tmp46, <8 x i8> %vtbl2.i25.i) + store <8 x i8> %vtbl2.i25.i313, <8 x i8>* %addr2 + ret void +} + +; Check that we do not perform the transformation for a dynamic index. +; CHECK-LABEL: dynamicIndex: +; CHECK-NOT: mul +; CHECK: pop +define void @dynamicIndex(<2 x i64>* %addr, <8 x i8>* %addr2, i32 %index) { + %shuffle.i.bc.i309 = load <2 x i64>, <2 x i64>* %addr + %vtbl2.i25.i = load <8 x i8>, <8 x i8>* %addr2 + %shuffle.i.extract.i310 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 %index + %shuffle.i27.extract.i311 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 1 + %tmp45 = bitcast i64 %shuffle.i.extract.i310 to <8 x i8> + %tmp46 = bitcast i64 %shuffle.i27.extract.i311 to <8 x i8> + %vtbl2.i25.i313 = tail call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %tmp45, <8 x i8> %tmp46, <8 x i8> %vtbl2.i25.i) + store <8 x i8> %vtbl2.i25.i313, <8 x i8>* %addr2 + ret void +} + +; Check that we do not perform the transformation when there are several uses +; of the source of the bitcast. +; CHECK-LABEL: severalUses: +; ARG1_VALlo is hard coded because we need to access the high part of d0, +; i.e., s1, and we can't express that with filecheck. 
+; CHECK: vld1.32 {[[ARG1_VALlo:d0]], [[ARG1_VALhi:d[0-9]+]]}, [r0] +; CHECK-NEXT: vldr [[ARG2_VAL:d[0-9]+]], [r1] +; s1 is actually 2 * ARG1_VALlo + 1, but we cannot express that with filecheck. +; CHECK-NEXT: vmov [[REThi:r[0-9]+]], s1 +; We build the return value here. s0 is 2 * ARG1_VALlo. +; CHECK-NEXT: vmov r0, s0 +; This copy is correct but actually useless. We should be able to clean it up. +; CHECK-NEXT: vmov [[ARG1_VALloCPY:d[0-9]+]], r0, [[REThi]] +; CHECK-NEXT: vtbl.8 [[RES:d[0-9]+]], {[[ARG1_VALloCPY]], [[ARG1_VALhi]]}, [[ARG2_VAL]] +; CHECK-NEXT: vstr [[RES]], [r1] +; CHECK-NEXT: mov r1, [[REThi]] +; CHECK-NEXT: bx lr +define i64 @severalUses(<2 x i64>* %addr, <8 x i8>* %addr2) { + %shuffle.i.bc.i309 = load <2 x i64>, <2 x i64>* %addr + %vtbl2.i25.i = load <8 x i8>, <8 x i8>* %addr2 + %shuffle.i.extract.i310 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 0 + %shuffle.i27.extract.i311 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 1 + %tmp45 = bitcast i64 %shuffle.i.extract.i310 to <8 x i8> + %tmp46 = bitcast i64 %shuffle.i27.extract.i311 to <8 x i8> + %vtbl2.i25.i313 = tail call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %tmp45, <8 x i8> %tmp46, <8 x i8> %vtbl2.i25.i) + store <8 x i8> %vtbl2.i25.i313, <8 x i8>* %addr2 + ret i64 %shuffle.i.extract.i310 +} |

