2 files changed, 127 insertions, 0 deletions
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 33f74a3ba9f..23f7bd0f4c8 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -4139,6 +4139,56 @@ static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
   Results.push_back(Read.getOperand(0));
 }
 
+/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
+/// When \p DstVT, the destination type of \p BC, is on the vector
+/// register bank and the source of bitcast, \p Op, operates on the same bank,
+/// it might be possible to combine them, such that everything stays on the
+/// vector register bank.
+/// \p return The node that would replace \p BT, if the combine
+/// is possible.
+static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
+                                                SelectionDAG &DAG) {
+  SDValue Op = BC->getOperand(0);
+  EVT DstVT = BC->getValueType(0);
+
+  // The only vector instruction that can produce a scalar (remember,
+  // since the bitcast was about to be turned into VMOVDRR, the source
+  // type is i64) from a vector is EXTRACT_VECTOR_ELT.
+  // Moreover, we can do this combine only if there is one use.
+  // Finally, if the destination type is not a vector, there is not
+  // much point on forcing everything on the vector bank.
+  if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      !Op.hasOneUse())
+    return SDValue();
+
+  // If the index is not constant, we will introduce an additional
+  // multiply that will stick.
+  // Give up in that case.
+  ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (!Index)
+    return SDValue();
+  unsigned DstNumElt = DstVT.getVectorNumElements();
+
+  // Compute the new index.
+  const APInt &APIntIndex = Index->getAPIntValue();
+  APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
+  NewIndex *= APIntIndex;
+  // Check if the new constant index fits into i32.
+  if (NewIndex.getBitWidth() > 32)
+    return SDValue();
+
+  // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
+  // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
+  SDLoc dl(Op);
+  SDValue ExtractSrc = Op.getOperand(0);
+  EVT VecVT = EVT::getVectorVT(
+      *DAG.getContext(), DstVT.getScalarType(),
+      ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
+  SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
+                     DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
+}
+
 /// ExpandBITCAST - If the target supports VFP, this function is called to
 /// expand a bit convert where either the source or destination type is i64 to
 /// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
@@ -4158,6 +4208,11 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
 
   // Turn i64->f64 into VMOVDRR.
   if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
+    // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
+    // if we can combine the bitcast with its source.
+    if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
+      return Val;
+
     SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
                              DAG.getConstant(0, dl, MVT::i32));
     SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
diff --git a/llvm/test/CodeGen/ARM/combine-vmovdrr.ll b/llvm/test/CodeGen/ARM/combine-vmovdrr.ll
new file mode 100644
index 00000000000..358f7e3a983
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/combine-vmovdrr.ll
@@ -0,0 +1,72 @@
+; RUN: llc %s -o - | FileCheck %s
+
+target triple = "thumbv7s-apple-ios"
+
+declare <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %shuffle.i.i307, <8 x i8> %shuffle.i27.i308, <8 x i8> %vtbl2.i25.i)
+
+; Check that we get the motivating example:
+; The bitcasts force the values to go through the GPRs, whereas
+; they are defined on VPRs and used on VPRs.
+;
+; CHECK-LABEL: motivatingExample:
+; CHECK: vldr [[ARG2_VAL:d[0-9]+]], [r1]
+; CHECK-NEXT: vld1.32 {[[ARG1_VALlo:d[0-9]+]], [[ARG1_VALhi:d[0-9]+]]}, [r0]
+; CHECK-NEXT: vtbl.8 [[RES:d[0-9]+]], {[[ARG1_VALlo]], [[ARG1_VALhi]]}, [[ARG2_VAL]]
+; CHECK-NEXT: vstr [[RES]], [r1]
+; CHECK-NEXT: bx lr
+define void @motivatingExample(<2 x i64>* %addr, <8 x i8>* %addr2) {
+  %shuffle.i.bc.i309 = load <2 x i64>, <2 x i64>* %addr
+  %vtbl2.i25.i = load <8 x i8>, <8 x i8>* %addr2
+  %shuffle.i.extract.i310 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 0
+  %shuffle.i27.extract.i311 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 1
+  %tmp45 = bitcast i64 %shuffle.i.extract.i310 to <8 x i8>
+  %tmp46 = bitcast i64 %shuffle.i27.extract.i311 to <8 x i8>
+  %vtbl2.i25.i313 = tail call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %tmp45, <8 x i8> %tmp46, <8 x i8> %vtbl2.i25.i)
+  store <8 x i8> %vtbl2.i25.i313, <8 x i8>* %addr2
+  ret void
+}
+
+; Check that we do not perform the transformation for dynamic index.
+; CHECK-LABEL: dynamicIndex:
+; CHECK-NOT: mul
+; CHECK: pop
+define void @dynamicIndex(<2 x i64>* %addr, <8 x i8>* %addr2, i32 %index) {
+  %shuffle.i.bc.i309 = load <2 x i64>, <2 x i64>* %addr
+  %vtbl2.i25.i = load <8 x i8>, <8 x i8>* %addr2
+  %shuffle.i.extract.i310 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 %index
+  %shuffle.i27.extract.i311 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 1
+  %tmp45 = bitcast i64 %shuffle.i.extract.i310 to <8 x i8>
+  %tmp46 = bitcast i64 %shuffle.i27.extract.i311 to <8 x i8>
+  %vtbl2.i25.i313 = tail call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %tmp45, <8 x i8> %tmp46, <8 x i8> %vtbl2.i25.i)
+  store <8 x i8> %vtbl2.i25.i313, <8 x i8>* %addr2
+  ret void
+}
+
+; Check that we do not perform the transformation when there are several uses
+; of the result of the bitcast.
+; CHECK-LABEL: severalUses:
+; ARG1_VALlo is hard coded because we need to access the high part of d0,
+; i.e., s1, and we can't express that with filecheck.
+; CHECK: vld1.32 {[[ARG1_VALlo:d0]], [[ARG1_VALhi:d[0-9]+]]}, [r0]
+; CHECK-NEXT: vldr [[ARG2_VAL:d[0-9]+]], [r1]
+; s1 is actually 2 * ARG1_VALlo + 1, but we cannot express that with filecheck.
+; CHECK-NEXT: vmov [[REThi:r[0-9]+]], s1
+; We build the return value here. s0 is 2 * ARG1_VALlo.
+; CHECK-NEXT: vmov r0, s0
+; This copy is correct but actually useless. We should be able to clean it up.
+; CHECK-NEXT: vmov [[ARG1_VALloCPY:d[0-9]+]], r0, [[REThi]]
+; CHECK-NEXT: vtbl.8 [[RES:d[0-9]+]], {[[ARG1_VALloCPY]], [[ARG1_VALhi]]}, [[ARG2_VAL]]
+; CHECK-NEXT: vstr [[RES]], [r1]
+; CHECK-NEXT: mov r1, [[REThi]]
+; CHECK-NEXT: bx lr
+define i64 @severalUses(<2 x i64>* %addr, <8 x i8>* %addr2) {
+  %shuffle.i.bc.i309 = load <2 x i64>, <2 x i64>* %addr
+  %vtbl2.i25.i = load <8 x i8>, <8 x i8>* %addr2
+  %shuffle.i.extract.i310 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 0
+  %shuffle.i27.extract.i311 = extractelement <2 x i64> %shuffle.i.bc.i309, i32 1
+  %tmp45 = bitcast i64 %shuffle.i.extract.i310 to <8 x i8>
+  %tmp46 = bitcast i64 %shuffle.i27.extract.i311 to <8 x i8>
+  %vtbl2.i25.i313 = tail call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %tmp45, <8 x i8> %tmp46, <8 x i8> %vtbl2.i25.i)
+  store <8 x i8> %vtbl2.i25.i313, <8 x i8>* %addr2
+  ret i64 %shuffle.i.extract.i310
+}