| | | |
|---|---|---|
| author | Sanjay Patel <spatel@rotateright.com> | 2019-04-23 15:20:17 +0000 |
| committer | Sanjay Patel <spatel@rotateright.com> | 2019-04-23 15:20:17 +0000 |
| commit | 12a561fa1b79c449d518100e1dd0dfce0a37b65d | |
| tree | 37307c19c1e8a91b664f27954e291e817c54d7cf | |
| parent | 6e7cc49d5cb31ee09b07252b6641d7c94977fd12 | |
[x86] use psubus for more vsetcc lowering (PR39859)
Circling back to a leftover bit from PR39859:
https://bugs.llvm.org/show_bug.cgi?id=39859#c1
...we have this counter-intuitive (based on the test diffs) opportunity to use 'psubus'.
This appears to be the better perf option for both Haswell and Jaguar based on llvm-mca.
We already do this transform for the SETULT predicate, so this makes the code more
symmetrical too. If we have pminub/pminuw, we prefer those, so this should not affect
anything but pre-SSE4.1 subtargets.
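The identity behind the new SETUGT case (spelled out in the code comment below as cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0) can be checked in isolation: for unsigned x, x > C holds exactly when the saturating subtraction (C+1) - x clamps to zero, provided C+1 does not wrap. A minimal standalone C++ sketch of that scalar check follows; the file name, the usubsat16 helper, and the example constant 242 are illustrative only and not part of the patch.
$ cat ugt_via_usubsat.cpp
// Illustrative-only check (not from the patch): for 16-bit unsigned x and a
// constant C with C + 1 representable, x > C holds exactly when the
// saturating subtraction (C + 1) - x clamps to 0, i.e. the scalar form of
// the psubusw plus pcmpeqw-with-zero sequence shown in after.s.
#include <cstdint>
#include <cstdio>

// Scalar analogue of psubusw: unsigned subtraction that clamps at 0.
static uint16_t usubsat16(uint16_t a, uint16_t b) {
  return a > b ? static_cast<uint16_t>(a - b) : 0;
}

int main() {
  const uint16_t C = 242; // example constant; any value below 0xFFFF works
  for (uint32_t x = 0; x <= 0xFFFF; ++x) {
    const bool ugt = static_cast<uint16_t>(x) > C;
    const bool via_subus =
        usubsat16(static_cast<uint16_t>(C + 1), static_cast<uint16_t>(x)) == 0;
    if (ugt != via_subus) {
      std::printf("mismatch at x = %u\n", x);
      return 1;
    }
  }
  std::printf("x > %u == (usubsat(%u, x) == 0) for every 16-bit x\n",
              static_cast<unsigned>(C), static_cast<unsigned>(C) + 1);
  return 0;
}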
$ cat before.s
movdqa -16(%rip), %xmm2 ## xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
pxor %xmm0, %xmm2
pcmpgtw -32(%rip), %xmm2 ## xmm2 = [255,255,255,255,255,255,255,255]
pand %xmm2, %xmm0
pandn %xmm1, %xmm2
por %xmm2, %xmm0
$ cat after.s
movdqa -16(%rip), %xmm2 ## xmm2 = [256,256,256,256,256,256,256,256]
psubusw %xmm0, %xmm2
pxor %xmm3, %xmm3
pcmpeqw %xmm2, %xmm3
pand %xmm3, %xmm0
pandn %xmm1, %xmm3
por %xmm3, %xmm0
$ llvm-mca before.s -mcpu=haswell
Iterations: 100
Instructions: 600
Total Cycles: 909
Total uOps: 700
Dispatch Width: 4
uOps Per Cycle: 0.77
IPC: 0.66
Block RThroughput: 1.8
$ llvm-mca after.s -mcpu=haswell
Iterations: 100
Instructions: 700
Total Cycles: 409
Total uOps: 700
Dispatch Width: 4
uOps Per Cycle: 1.71
IPC: 1.71
Block RThroughput: 1.8
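For reference, the uOps Per Cycle and IPC lines above are simply Total uOps / Total Cycles and Instructions / Total Cycles: 700/909 ≈ 0.77 and 600/909 ≈ 0.66 before, versus 700/409 ≈ 1.71 for both after, even though the block reciprocal throughput (1.8 cycles) is unchanged.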
Differential Revision: https://reviews.llvm.org/D60838
llvm-svn: 358999
| | | |
|---|---|---|
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 30 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vec_setcc-2.ll | 13 |

2 files changed, 30 insertions, 13 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6f8894060d6..04de45c8ad2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -19747,10 +19747,11 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
   return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
 }
 
-/// Given a simple buildvector constant, return a new vector constant with each
-/// element decremented. If decrementing would result in underflow or this
-/// is not a simple vector constant, return an empty value.
-static SDValue decrementVectorConstant(SDValue V, SelectionDAG &DAG) {
+/// Given a buildvector constant, return a new vector constant with each element
+/// incremented or decremented. If incrementing or decrementing would result in
+/// unsigned overflow or underflow or this is not a simple vector constant,
+/// return an empty value.
+static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
   auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
   if (!BV)
     return SDValue();
@@ -19765,11 +19766,12 @@ static SDValue decrementVectorConstant(SDValue V, SelectionDAG &DAG) {
     if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
       return SDValue();
 
-    // Avoid underflow.
-    if (Elt->getAPIntValue().isNullValue())
+    // Avoid overflow/underflow.
+    const APInt &EltC = Elt->getAPIntValue();
+    if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
       return SDValue();
 
-    NewVecC.push_back(DAG.getConstant(Elt->getAPIntValue() - 1, DL, EltVT));
+    NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
   }
 
   return DAG.getBuildVector(VT, DL, NewVecC);
@@ -19801,12 +19803,24 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
     // Only do this pre-AVX since vpcmp* is no longer destructive.
     if (Subtarget.hasAVX())
       return SDValue();
-    SDValue ULEOp1 = decrementVectorConstant(Op1, DAG);
+    SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, false);
     if (!ULEOp1)
       return SDValue();
     Op1 = ULEOp1;
     break;
   }
+  case ISD::SETUGT: {
+    // If the comparison is against a constant, we can turn this into a setuge.
+    // This is beneficial because materializing a constant 0 for the PCMPEQ is
+    // probably cheaper than XOR+PCMPGT using 2 different vector constants:
+    // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
+    SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, true);
+    if (!UGEOp1)
+      return SDValue();
+    Op1 = Op0;
+    Op0 = UGEOp1;
+    break;
+  }
   // Psubus is better than flip-sign because it requires no inversion.
   case ISD::SETUGE:
     std::swap(Op0, Op1);
diff --git a/llvm/test/CodeGen/X86/vec_setcc-2.ll b/llvm/test/CodeGen/X86/vec_setcc-2.ll
index 2a0a166ec9a..32d5c0b78a1 100644
--- a/llvm/test/CodeGen/X86/vec_setcc-2.ll
+++ b/llvm/test/CodeGen/X86/vec_setcc-2.ll
@@ -194,8 +194,10 @@ define <16 x i1> @ugt_v16i8_splat(<16 x i8> %x) {
 define <8 x i1> @ugt_v8i16_splat(<8 x i16> %x) {
 ; SSE2-LABEL: ugt_v8i16_splat:
 ; SSE2:       ## %bb.0:
-; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    pcmpgtw {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [243,243,243,243,243,243,243,243]
+; SSE2-NEXT:    psubusw %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    pcmpeqw %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: ugt_v8i16_splat:
@@ -541,9 +543,10 @@ define <4 x i1> @ugt_v4i32_splat_commute(<4 x i32> %x) {
 define <8 x i16> @PR39859(<8 x i16> %x, <8 x i16> %y) {
 ; SSE2-LABEL: PR39859:
 ; SSE2:       ## %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT:    pxor %xmm0, %xmm2
-; SSE2-NEXT:    pcmpgtw {{.*}}(%rip), %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [43,43,43,43,43,43,43,43]
+; SSE2-NEXT:    psubusw %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pcmpeqw %xmm3, %xmm2
 ; SSE2-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NEXT:    pandn %xmm0, %xmm2
 ; SSE2-NEXT:    por %xmm1, %xmm2

