diff options
author | Pierre Gousseau <pierregousseau14@gmail.com> | 2016-10-14 16:41:38 +0000 |
---|---|---|
committer | Pierre Gousseau <pierregousseau14@gmail.com> | 2016-10-14 16:41:38 +0000 |
commit | b6d652adb5b12b7d1fc7e973a5afc019875cb547 (patch) | |
tree | a8196cccb70b3ebb8eba63d17f04efa2f7f31fde /llvm/lib | |
parent | 6d6eca5cdc995fdb8850fd5c79d1018893a44988 (diff) | |
download | bcm5719-llvm-b6d652adb5b12b7d1fc7e973a5afc019875cb547.tar.gz bcm5719-llvm-b6d652adb5b12b7d1fc7e973a5afc019875cb547.zip |
[X86] Take advantage of the lzcnt instruction on btver2 architectures when ORing comparisons to zero.
This change adds transformations such as:
zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
To:
srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
This optimisation is beneficial on Jaguar architecture only, where lzcnt has a good reciprocal throughput.
Other architectures such as Intel's Haswell/Broadwell or AMD's Bulldozer/PileDriver do not benefit from it.
For this reason the change also adds a "HasFastLZCNT" feature which gets enabled for Jaguar.
Differential Revision: https://reviews.llvm.org/D23446
llvm-svn: 284248
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/X86/X86.td | 7 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 114 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.h | 2 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86InstrInfo.td | 1 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.cpp | 1 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.h | 4 |
6 files changed, 129 insertions, 0 deletions
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 9b6fe6893de..9d48af96471 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -262,6 +262,12 @@ def FeatureFastScalarFSQRT def FeatureFastVectorFSQRT : SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT", "true", "Vector SQRT is fast (disable Newton-Raphson)">; +// If lzcnt has equivalent latency/throughput to most simple integer ops, it can +// be used to replace test/set sequences. +def FeatureFastLZCNT + : SubtargetFeature< + "fast-lzcnt", "HasFastLZCNT", "true", + "LZCNT instructions are as fast as most simple integer ops">; //===----------------------------------------------------------------------===// // X86 processors supported. @@ -646,6 +652,7 @@ def : ProcessorModel<"btver2", BtVer2Model, [ FeatureF16C, FeatureMOVBE, FeatureLZCNT, + FeatureFastLZCNT, FeaturePOPCNT, FeatureXSAVE, FeatureXSAVEOPT, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d02dc761db0..db68237eecb 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -4178,6 +4178,10 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const { return Subtarget.hasLZCNT(); } +bool X86TargetLowering::isCtlzFast() const { + return Subtarget.hasFastLZCNT(); +} + bool X86TargetLowering::hasAndNotCompare(SDValue Y) const { if (!Subtarget.hasBMI()) return false; @@ -29090,6 +29094,113 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, return DAG.getBitcast(VT, Mask); } +// Helper function for combineOrCmpEqZeroToCtlzSrl +// Transforms: +// seteq(cmp x, 0) +// into: +// srl(ctlz x), log2(bitsize(x)) +// Input pattern is checked by caller. +SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy, SelectionDAG &DAG) { + SDValue Cmp = Op.getOperand(1); + EVT VT = Cmp.getOperand(0).getValueType(); + unsigned Log2b = Log2_32(VT.getSizeInBits()); + SDLoc dl(Op); + SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0)); + // The result of the shift is true or false, and on X86, the 32-bit + // encoding of shr and lzcnt is more desirable. + SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32); + SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc, + DAG.getConstant(Log2b, dl, VT)); + return DAG.getZExtOrTrunc(Scc, dl, ExtTy); +} + +// Try to transform: +// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0)))) +// into: +// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)) +// Will also attempt to match more generic cases, eg: +// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0))) +// Only applies if the target supports the FastLZCNT feature. +static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast()) + return SDValue(); + + auto isORCandidate = [](SDValue N) { + return (N->getOpcode() == ISD::OR && N->hasOneUse()); + }; + + // Check the zero extend is extending to 32-bit or more. The code generated by + // srl(ctlz) for 16-bit or less variants of the pattern would require extra + // instructions to clear the upper bits. + if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) || + !isORCandidate(N->getOperand(0))) + return SDValue(); + + // Check the node matches: setcc(eq, cmp 0) + auto isSetCCCandidate = [](SDValue N) { + return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() && + X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E && + N->getOperand(1).getOpcode() == X86ISD::CMP && + N->getOperand(1).getConstantOperandVal(1) == 0 && + N->getOperand(1).getValueType().bitsGE(MVT::i32); + }; + + SDNode *OR = N->getOperand(0).getNode(); + SDValue LHS = OR->getOperand(0); + SDValue RHS = OR->getOperand(1); + + // Save nodes matching or(or, setcc(eq, cmp 0)). + SmallVector<SDNode *, 2> ORNodes; + while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) || + (isORCandidate(RHS) && isSetCCCandidate(LHS)))) { + ORNodes.push_back(OR); + OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode(); + LHS = OR->getOperand(0); + RHS = OR->getOperand(1); + } + + // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)). + if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) || + !isORCandidate(SDValue(OR, 0))) + return SDValue(); + + // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it + // to + // or(srl(ctlz),srl(ctlz)). + // The dag combiner can then fold it into: + // srl(or(ctlz, ctlz)). + EVT VT = OR->getValueType(0); + SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG); + SDValue Ret, NewRHS; + if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG))) + Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS); + + if (!Ret) + return SDValue(); + + // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern. + while (ORNodes.size() > 0) { + OR = ORNodes.pop_back_val(); + LHS = OR->getOperand(0); + RHS = OR->getOperand(1); + // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or). + if (RHS->getOpcode() == ISD::OR) + std::swap(LHS, RHS); + EVT VT = OR->getValueType(0); + SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG); + if (!NewRHS) + return SDValue(); + Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS); + } + + if (Ret) + Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret); + + return Ret; +} + static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -31121,6 +31232,9 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget)) return NewAdd; + if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget)) + return R; + return SDValue(); } diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index b193dc21113..3fa22832c21 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -771,6 +771,8 @@ namespace llvm { bool isCheapToSpeculateCtlz() const override; + bool isCtlzFast() const override; + bool hasBitPreservingFPLogic(EVT VT) const override { return VT == MVT::f32 || VT == MVT::f64 || VT.isVector(); } diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td index 3cbcfa6c723..04761380971 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -890,6 +890,7 @@ def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">; def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">; def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">; def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">; +def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">; def HasMFence : Predicate<"Subtarget->hasMFence()">; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index 156c0b99a2b..91fdc5afe8a 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -284,6 +284,7 @@ void X86Subtarget::initializeEnvironment() { HasFastPartialYMMWrite = false; HasFastScalarFSQRT = false; HasFastVectorFSQRT = false; + HasFastLZCNT = false; HasSlowDivide32 = false; HasSlowDivide64 = false; PadShortFunctions = false; diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index a5cd83d2514..434edf63c6e 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -215,6 +215,9 @@ protected: /// 64-bit divisions and should be used when possible. bool HasSlowDivide64; + /// True if LZCNT instruction is fast. + bool HasFastLZCNT; + /// True if the short functions should be padded to prevent /// a stall when returning too early. bool PadShortFunctions; @@ -444,6 +447,7 @@ public: bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; } bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; } bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; } + bool hasFastLZCNT() const { return HasFastLZCNT; } bool hasSlowDivide32() const { return HasSlowDivide32; } bool hasSlowDivide64() const { return HasSlowDivide64; } bool padShortFunctions() const { return PadShortFunctions; } |