author    | Eli Friedman <efriedma@codeaurora.org> | 2016-10-18 21:03:40 +0000
committer | Eli Friedman <efriedma@codeaurora.org> | 2016-10-18 21:03:40 +0000
commit    | c0a717ba5bf41ad90ba1ad8151f2b15b79c65998 (patch)
tree      | c8701a348c439c2afc62edc309a4a3fdcf791df6 /llvm
parent    | 36efa68463bc380c492cd08ae614c04ba3cd9fd7 (diff)
download  | bcm5719-llvm-c0a717ba5bf41ad90ba1ad8151f2b15b79c65998.tar.gz
          | bcm5719-llvm-c0a717ba5bf41ad90ba1ad8151f2b15b79c65998.zip
Improve ARM lowering for "icmp <2 x i64> eq".
The custom lowering is pretty straightforward: do the compare as
<4 x i32>, then AND together the two 32-bit halves of each 64-bit lane
of the result.
Differential Revision: https://reviews.llvm.org/D25713
llvm-svn: 284536
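The trick relies on a simple identity: a 64-bit lane is equal exactly when both of its 32-bit halves are equal, which is what the vceq.i32 / vrev64.32 / vand sequence in the patch computes per lane. Below is a minimal scalar C++ sketch of a single lane, for illustration only; the eq64_via_halves helper is a name made up here, not anything from the patch:

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical scalar model of the NEON sequence for a single 64-bit lane:
// vceq.i32 yields an all-ones/all-zeros mask per 32-bit half, and ANDing
// that mask with its half-swapped self (vrev64.32) leaves all-ones only
// when *both* halves matched.
static uint64_t eq64_via_halves(uint64_t a, uint64_t b) {
  uint32_t loMask = (uint32_t(a) == uint32_t(b)) ? ~0u : 0u;             // low halves
  uint32_t hiMask = (uint32_t(a >> 32) == uint32_t(b >> 32)) ? ~0u : 0u; // high halves
  uint32_t both = loMask & hiMask;       // vand after vrev64.32 pairs the masks
  return (uint64_t(both) << 32) | both;  // all-ones or all-zeros lane result
}

int main() {
  assert(eq64_via_halves(42, 42) == ~0ull);
  assert(eq64_via_halves(42, 43) == 0);
  // Equal low halves alone must not count as lane equality.
  assert(eq64_via_halves(0x0000000100000000ull, 0x0000000200000000ull) == 0);
}
```

In the vector form, vrev64.32 swaps the two 32-bit compare results inside each 64-bit lane, so the vand lines each half's mask up with its partner's; a vmvn then inverts the result for the SETNE case.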
Diffstat (limited to 'llvm')
-rw-r--r-- | llvm/lib/Target/ARM/ARMISelLowering.cpp | 27
-rw-r--r-- | llvm/test/CodeGen/ARM/vicmp-64.ll       | 52
2 files changed, 73 insertions, 6 deletions
```diff
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index a41c4fcb9cd..dcbf3c5e513 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -278,7 +278,7 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
   }
 
   MVT ElemTy = VT.getVectorElementType();
-  if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
+  if (ElemTy != MVT::f64)
     setOperationAction(ISD::SETCC, VT, Custom);
   setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
   setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
@@ -742,8 +742,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
     setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
     setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
-    setOperationAction(ISD::SETCC, MVT::v1i64, Expand);
-    setOperationAction(ISD::SETCC, MVT::v2i64, Expand);
     // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
     // a destination type that is wider than the source, and nor does
     // it have a FP_TO_[SU]INT instruction with a narrower destination than
@@ -5242,10 +5240,27 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
   SDLoc dl(Op);
 
+  if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
+      (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
+    // Special-case integer 64-bit equality comparisons. They aren't legal,
+    // but they can be lowered with a few vector instructions.
+    unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
+    EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
+    SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
+    SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
+    SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
+                              DAG.getCondCode(ISD::SETEQ));
+    SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
+    SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
+    Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
+    if (SetCCOpcode == ISD::SETNE)
+      Merged = DAG.getNOT(dl, Merged, CmpVT);
+    Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
+    return Merged;
+  }
+
   if (CmpVT.getVectorElementType() == MVT::i64)
-    // 64-bit comparisons are not legal. We've marked SETCC as non-Custom,
-    // but it's possible that our operands are 64-bit but our result is 32-bit.
-    // Bail in this case.
+    // 64-bit comparisons are not legal in general.
     return SDValue();
 
   if (Op1.getValueType().isFloatingPoint()) {
diff --git a/llvm/test/CodeGen/ARM/vicmp-64.ll b/llvm/test/CodeGen/ARM/vicmp-64.ll
new file mode 100644
index 00000000000..57e036bde22
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/vicmp-64.ll
@@ -0,0 +1,52 @@
+; RUN: llc -mtriple=arm -mattr=+neon %s -o - | FileCheck %s
+
+; Check codegen for 64-bit icmp operations, which don't directly map to any
+; instruction.
+
+define <2 x i64> @vne(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: vne:
+;CHECK: vceq.i32
+;CHECK-NEXT: vrev64.32
+;CHECK-NEXT: vand
+;CHECK-NEXT: vmvn
+;CHECK-NEXT: vmov
+;CHECK-NEXT: vmov
+;CHECK-NEXT: mov pc, lr
+  %tmp1 = load <2 x i64>, <2 x i64>* %A
+  %tmp2 = load <2 x i64>, <2 x i64>* %B
+  %tmp3 = icmp ne <2 x i64> %tmp1, %tmp2
+  %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+  ret <2 x i64> %tmp4
+}
+
+define <2 x i64> @veq(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: veq:
+;CHECK: vceq.i32
+;CHECK-NEXT: vrev64.32
+;CHECK-NEXT: vand
+;CHECK-NEXT: vmov
+;CHECK-NEXT: vmov
+;CHECK-NEXT: mov pc, lr
+  %tmp1 = load <2 x i64>, <2 x i64>* %A
+  %tmp2 = load <2 x i64>, <2 x i64>* %B
+  %tmp3 = icmp eq <2 x i64> %tmp1, %tmp2
+  %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+  ret <2 x i64> %tmp4
+}
+
+; FIXME: We currently generate terrible code for this.
+; (Atop < Btop) | ((ATop == BTop) & (ABottom < BBottom))
+; would come out to roughly 6 instructions, but we currently
+; scalarize it.
+define <2 x i64> @vult(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: vult:
+;CHECK: subs
+;CHECK: sbcs
+;CHECK: subs
+;CHECK: sbcs
+  %tmp1 = load <2 x i64>, <2 x i64>* %A
+  %tmp2 = load <2 x i64>, <2 x i64>* %B
+  %tmp3 = icmp ult <2 x i64> %tmp1, %tmp2
+  %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+  ret <2 x i64> %tmp4
+}
```
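The FIXME above already spells out the lowering the author would prefer for unsigned less-than. As a sanity check of that identity only (not code from the patch; ult64_via_halves is a hypothetical helper), here is a scalar C++ sketch:

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical scalar check of the identity quoted in the FIXME:
//   a <u b  iff  (aHi < bHi) | ((aHi == bHi) & (aLo < bLo))
// with all half-comparisons done unsigned on the 32-bit halves.
static bool ult64_via_halves(uint64_t a, uint64_t b) {
  uint32_t aHi = uint32_t(a >> 32), bHi = uint32_t(b >> 32);
  uint32_t aLo = uint32_t(a), bLo = uint32_t(b);
  return (aHi < bHi) || (aHi == bHi && aLo < bLo);
}

int main() {
  assert(ult64_via_halves(1, 2));
  assert(!ult64_via_halves(2, 1));
  assert(ult64_via_halves(0xFFFFFFFFull, 0x100000000ull));  // high halves decide
  assert(!ult64_via_halves(0x100000000ull, 0xFFFFFFFFull));
  assert(!ult64_via_halves(7, 7));
}
```

Each of the three 32-bit half-comparisons maps naturally onto a NEON vector compare, which is presumably where the FIXME's estimate of roughly six instructions comes from.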