| | | |
|---|---|---|
| author | Sanjay Patel <spatel@rotateright.com> | 2019-04-23 15:20:17 +0000 |
| committer | Sanjay Patel <spatel@rotateright.com> | 2019-04-23 15:20:17 +0000 |
| commit | 12a561fa1b79c449d518100e1dd0dfce0a37b65d | |
| tree | 37307c19c1e8a91b664f27954e291e817c54d7cf | |
| parent | 6e7cc49d5cb31ee09b07252b6641d7c94977fd12 | |
[x86] use psubus for more vsetcc lowering (PR39859)
Circling back to a leftover bit from PR39859:
https://bugs.llvm.org/show_bug.cgi?id=39859#c1
...we have this counter-intuitive (based on the test diffs) opportunity to use 'psubus'.
This appears to be the better perf option for both Haswell and Jaguar based on llvm-mca.
We already do this transform for the SETULT predicate, so this makes the code more
symmetrical too. If we have pminub/pminuw, we prefer those, so this should not affect
anything but pre-SSE4.1 subtargets.
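The identity behind the new SETUGT case (spelled out in the code comment below as cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0) can be checked in isolation: for unsigned x, x > C holds exactly when the saturating subtraction (C+1) - x clamps to zero, provided C+1 does not wrap. A minimal standalone C++ sketch of that scalar check follows; the file name, the usubsat16 helper, and the example constant 242 are illustrative only and not part of the patch.
$ cat ugt_via_usubsat.cpp
// Illustrative-only check (not from the patch): for 16-bit unsigned x and a
// constant C with C + 1 representable, x > C holds exactly when the
// saturating subtraction (C + 1) - x clamps to 0, i.e. the scalar form of
// the psubusw plus pcmpeqw-with-zero sequence shown in after.s.
#include <cstdint>
#include <cstdio>

// Scalar analogue of psubusw: unsigned subtraction that clamps at 0.
static uint16_t usubsat16(uint16_t a, uint16_t b) {
  return a > b ? static_cast<uint16_t>(a - b) : 0;
}

int main() {
  const uint16_t C = 242; // example constant; any value below 0xFFFF works
  for (uint32_t x = 0; x <= 0xFFFF; ++x) {
    const bool ugt = static_cast<uint16_t>(x) > C;
    const bool via_subus =
        usubsat16(static_cast<uint16_t>(C + 1), static_cast<uint16_t>(x)) == 0;
    if (ugt != via_subus) {
      std::printf("mismatch at x = %u\n", x);
      return 1;
    }
  }
  std::printf("x > %u == (usubsat(%u, x) == 0) for every 16-bit x\n",
              static_cast<unsigned>(C), static_cast<unsigned>(C) + 1);
  return 0;
}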
$ cat before.s
movdqa -16(%rip), %xmm2 ## xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
pxor %xmm0, %xmm2
pcmpgtw -32(%rip), %xmm2 ## xmm2 = [255,255,255,255,255,255,255,255]
pand %xmm2, %xmm0
pandn %xmm1, %xmm2
por %xmm2, %xmm0
$ cat after.s
movdqa -16(%rip), %xmm2 ## xmm2 = [256,256,256,256,256,256,256,256]
psubusw %xmm0, %xmm2
pxor %xmm3, %xmm3
pcmpeqw %xmm2, %xmm3
pand %xmm3, %xmm0
pandn %xmm1, %xmm3
por %xmm3, %xmm0
$ llvm-mca before.s -mcpu=haswell
Iterations: 100
Instructions: 600
Total Cycles: 909
Total uOps: 700
Dispatch Width: 4
uOps Per Cycle: 0.77
IPC: 0.66
Block RThroughput: 1.8
$ llvm-mca after.s -mcpu=haswell
Iterations: 100
Instructions: 700
Total Cycles: 409
Total uOps: 700
Dispatch Width: 4
uOps Per Cycle: 1.71
IPC: 1.71
Block RThroughput: 1.8
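For reference, the uOps Per Cycle and IPC lines above are simply Total uOps / Total Cycles and Instructions / Total Cycles: 700/909 ≈ 0.77 and 600/909 ≈ 0.66 before, versus 700/409 ≈ 1.71 for both after, even though the block reciprocal throughput (1.8 cycles) is unchanged.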
Differential Revision: https://reviews.llvm.org/D60838
llvm-svn: 358999
| | | |
|---|---|---|
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 30 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vec_setcc-2.ll | 13 |

2 files changed, 30 insertions, 13 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6f8894060d6..04de45c8ad2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -19747,10 +19747,11 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
   return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
 }
 
-/// Given a simple buildvector constant, return a new vector constant with each
-/// element decremented. If decrementing would result in underflow or this
-/// is not a simple vector constant, return an empty value.
-static SDValue decrementVectorConstant(SDValue V, SelectionDAG &DAG) {
+/// Given a buildvector constant, return a new vector constant with each element
+/// incremented or decremented. If incrementing or decrementing would result in
+/// unsigned overflow or underflow or this is not a simple vector constant,
+/// return an empty value.
+static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
   auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
   if (!BV)
     return SDValue();
@@ -19765,11 +19766,12 @@ static SDValue decrementVectorConstant(SDValue V, SelectionDAG &DAG) {
     if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
       return SDValue();
 
-    // Avoid underflow.
-    if (Elt->getAPIntValue().isNullValue())
+    // Avoid overflow/underflow.
+    const APInt &EltC = Elt->getAPIntValue();
+    if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
       return SDValue();
 
-    NewVecC.push_back(DAG.getConstant(Elt->getAPIntValue() - 1, DL, EltVT));
+    NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
   }
 
   return DAG.getBuildVector(VT, DL, NewVecC);
@@ -19801,12 +19803,24 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
     // Only do this pre-AVX since vpcmp* is no longer destructive.
     if (Subtarget.hasAVX())
       return SDValue();
-    SDValue ULEOp1 = decrementVectorConstant(Op1, DAG);
+    SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, false);
     if (!ULEOp1)
       return SDValue();
     Op1 = ULEOp1;
     break;
   }
+  case ISD::SETUGT: {
+    // If the comparison is against a constant, we can turn this into a setuge.
+    // This is beneficial because materializing a constant 0 for the PCMPEQ is
+    // probably cheaper than XOR+PCMPGT using 2 different vector constants:
+    // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
+    SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, true);
+    if (!UGEOp1)
+      return SDValue();
+    Op1 = Op0;
+    Op0 = UGEOp1;
+    break;
+  }
   // Psubus is better than flip-sign because it requires no inversion.
   case ISD::SETUGE:
     std::swap(Op0, Op1);
diff --git a/llvm/test/CodeGen/X86/vec_setcc-2.ll b/llvm/test/CodeGen/X86/vec_setcc-2.ll
index 2a0a166ec9a..32d5c0b78a1 100644
--- a/llvm/test/CodeGen/X86/vec_setcc-2.ll
+++ b/llvm/test/CodeGen/X86/vec_setcc-2.ll
@@ -194,8 +194,10 @@ define <16 x i1> @ugt_v16i8_splat(<16 x i8> %x) {
 define <8 x i1> @ugt_v8i16_splat(<8 x i16> %x) {
 ; SSE2-LABEL: ugt_v8i16_splat:
 ; SSE2:       ## %bb.0:
-; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    pcmpgtw {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [243,243,243,243,243,243,243,243]
+; SSE2-NEXT:    psubusw %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    pcmpeqw %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_v8i16_splat:
@@ -541,9 +543,10 @@ define <4 x i1> @ugt_v4i32_splat_commute(<4 x i32> %x) {
 define <8 x i16> @PR39859(<8 x i16> %x, <8 x i16> %y) {
 ; SSE2-LABEL: PR39859:
 ; SSE2:       ## %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT:    pxor %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtw {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [43,43,43,43,43,43,43,43]
+; SSE2-NEXT:    psubusw %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpeqw %xmm3, %xmm2
 ; SSE2-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NEXT:    pandn %xmm0, %xmm2
 ; SSE2-NEXT:    por %xmm1, %xmm2

