diff options
author | Sanjay Patel <spatel@rotateright.com> | 2019-02-28 19:47:04 +0000 |
---|---|---|
committer | Sanjay Patel <spatel@rotateright.com> | 2019-02-28 19:47:04 +0000 |
commit | 7fc6ef7dd7433c1947e1a00db0c32d1c68bf385f (patch) | |
tree | 8c5e21d6735a77017cca252210db17bf6c0c6584 /llvm/lib | |
parent | fadb22f4e2d64b614c87f48b625adaf217cc904f (diff) | |
download | bcm5719-llvm-7fc6ef7dd7433c1947e1a00db0c32d1c68bf385f.tar.gz bcm5719-llvm-7fc6ef7dd7433c1947e1a00db0c32d1c68bf385f.zip |
[x86] scalarize extract element 0 of FP math
This is another step towards ensuring that we produce the optimal code for reductions,
but there are other potential benefits as seen in the test diffs:
1. Memory loads may get scalarized resulting in more efficient code.
2. Memory stores may get scalarized resulting in more efficient code.
3. Complex ops like fdiv/sqrt get scalarized which may be faster instructions depending on uarch.
4. Even simple ops like addss/subss/mulss/roundss may result in faster operation/less frequency throttling when scalarized depending on uarch.
The TODO comment suggests one or more follow-ups for opcodes that can currently result in regressions.
Differential Revision: https://reviews.llvm.org/D58282
llvm-svn: 355130
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 59 |
1 files changed, 59 insertions, 0 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ac49ba02351..de666798b03 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34240,6 +34240,62 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// Extracting a scalar FP value from vector element 0 is free, so extract each
+/// operand first, then perform the math as a scalar op.
+static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
+  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
+  SDValue Vec = ExtElt->getOperand(0);
+  SDValue Index = ExtElt->getOperand(1);
+  EVT VT = ExtElt->getValueType(0);
+  EVT VecVT = Vec.getValueType();
+
+  // TODO: If this is a unary/expensive/expand op, allow extraction from a
+  // non-zero element because the shuffle+scalar op will be cheaper?
+  if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
+    return SDValue();
+
+  if (VT != MVT::f32 && VT != MVT::f64)
+    return SDValue();
+
+  // TODO: This switch could include FNEG, the x86-specific FP logic ops
+  // (FAND, FANDN, FOR, FXOR), FRSQRT/FRCP and other FP math ops. But that may
+  // require enhancements to avoid missed load folding and fma+fneg combining.
+  switch (Vec.getOpcode()) {
+  case ISD::FMA: // Begin 3 operands
+  case ISD::FMAD:
+  case ISD::FADD: // Begin 2 operands
+  case ISD::FSUB:
+  case ISD::FMUL:
+  case ISD::FDIV:
+  case ISD::FREM:
+  case ISD::FCOPYSIGN:
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE:
+  case ISD::FMAXIMUM:
+  case ISD::FMINIMUM:
+  case ISD::FABS: // Begin 1 operand
+  case ISD::FSQRT:
+  case ISD::FRINT:
+  case ISD::FCEIL:
+  case ISD::FTRUNC:
+  case ISD::FNEARBYINT:
+  case ISD::FROUND:
+  case ISD::FFLOOR: {
+    // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
+    SDLoc DL(ExtElt);
+    SmallVector<SDValue, 4> ExtOps;
+    for (SDValue Op : Vec->ops())
+      ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
+    return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
+  }
+  default:
+    return SDValue();
+  }
+  llvm_unreachable("All opcodes should return within switch");
+}
+
 /// Detect vector gather/scatter index generation and convert it from being a
 /// bunch of shuffles and extracts into a somewhat faster sequence.
 /// For i686, the best sequence is apparently storing the value and loading
@@ -34310,6 +34366,9 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
   if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
     return MinMax;
 
+  if (SDValue V = scalarizeExtEltFP(N, DAG))
+    return V;
+
   return SDValue();
 }
 |