summaryrefslogtreecommitdiffstats
path: root/llvm/lib/CodeGen
diff options
context:
space:
mode:
authorEli Friedman <efriedma@codeaurora.org>2018-08-16 18:39:39 +0000
committerEli Friedman <efriedma@codeaurora.org>2018-08-16 18:39:39 +0000
commit73e8a784e62f945a51363c8b5ec4eaedcf9f87e8 (patch)
tree15afc4a44bf8511135900d38ebc78216dcb23b1b /llvm/lib/CodeGen
parentd1767dc56f5be75bdff23f3fe33e54428fed704f (diff)
downloadbcm5719-llvm-73e8a784e62f945a51363c8b5ec4eaedcf9f87e8.tar.gz
bcm5719-llvm-73e8a784e62f945a51363c8b5ec4eaedcf9f87e8.zip
[SelectionDAG] Improve the legalisation lowering of UMULO.
There is no way in the universe that doing a full-width division in software will be faster than doing overflowing multiplication in software in the first place, especially given that this same full-width multiplication needs to be done anyway.

This patch replaces the previous implementation with a direct lowering into an overflowing multiplication algorithm based on half-width operations.

Correctness of the algorithm was verified by exhaustively checking the output of this algorithm for overflowing multiplication of 16-bit integers against an obviously correct widening multiplication. Barring any oversights introduced by porting the algorithm to DAG, confidence in the correctness of this algorithm is extremely high.

The following table shows the change in both t = runtime and s = space. The change is expressed as a multiplier of the original, so anything under 1 is "better" and anything above 1 is worse.

+-------+-----------+-----------+-------------+-------------+
| Arch  | u64*u64 t | u64*u64 s | u128*u128 t | u128*u128 s |
+-------+-----------+-----------+-------------+-------------+
| X64   |     -     |     -     |    ~0.5     |    ~0.64    |
| i686  |   ~0.5    |  ~0.6666  |    ~0.05    |    ~0.9     |
| armv7 |     -     |   ~0.75   |      -      |    ~1.4     |
+-------+-----------+-----------+-------------+-------------+

Performance numbers have been collected by running overflowing multiplication in a loop under `perf` on two x86_64-based machines (one Intel Haswell, the other AMD Ryzen). Size numbers have been collected by looking at the size of the function containing an overflowing multiply in a loop.

All in all, it can be seen that both performance and size have improved, except in the case of armv7, where code size has regressed for the 128-bit multiply. The u128*u128 overflowing multiply on 32-bit platforms seems to benefit from this change a lot, taking only 5% of the time of the original algorithm to calculate the same thing.
The final benefit of this change is that LLVM is now capable of lowering the overflowing unsigned multiply for integers of any bit-width, as long as the target is capable of lowering regular multiplication for the same bit-width. Previously, 128-bit overflowing multiply was the widest possible.

Patch by Simonas Kazlauskas!

Differential Revision: https://reviews.llvm.org/D50310
llvm-svn: 339922
Diffstat (limited to 'llvm/lib/CodeGen')
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp65
1 files changed, 48 insertions, 17 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 432f4c6a4f2..1d7ea182717 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2705,25 +2705,56 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
EVT VT = N->getValueType(0);
SDLoc dl(N);
- // A divide for UMULO should be faster than a function call.
if (N->getOpcode() == ISD::UMULO) {
+ // This section expands the operation into the following sequence of
+ // instructions. `iNh` here refers to a type which has half the bit width of
+ // the type the original operation operated on.
+ //
+ // %0 = %LHS.HI != 0 && %RHS.HI != 0
+ // %1 = { iNh, i1 } @umul.with.overflow.iNh(iNh %LHS.HI, iNh %RHS.LO)
+ // %2 = { iNh, i1 } @umul.with.overflow.iNh(iNh %RHS.HI, iNh %LHS.LO)
+ // %3 = mul nuw iN (%LHS.LO as iN), (%RHS.LO as iN)
+ // %4 = add iN (%1.0 as iN) << Nh, (%2.0 as iN) << Nh
+ // %5 = { iN, i1 } @uadd.with.overflow.iN( %4, %3 )
+ //
+ // %res = { %5.0, %0 || %1.1 || %2.1 || %5.1 }
SDValue LHS = N->getOperand(0), RHS = N->getOperand(1);
-
- SDValue MUL = DAG.getNode(ISD::MUL, dl, LHS.getValueType(), LHS, RHS);
- SplitInteger(MUL, Lo, Hi);
-
- // A divide for UMULO will be faster than a function call. Select to
- // make sure we aren't using 0.
- SDValue isZero = DAG.getSetCC(dl, getSetCCResultType(VT),
- RHS, DAG.getConstant(0, dl, VT), ISD::SETEQ);
- SDValue NotZero = DAG.getSelect(dl, VT, isZero,
- DAG.getConstant(1, dl, VT), RHS);
- SDValue DIV = DAG.getNode(ISD::UDIV, dl, VT, MUL, NotZero);
- SDValue Overflow = DAG.getSetCC(dl, N->getValueType(1), DIV, LHS,
- ISD::SETNE);
- Overflow = DAG.getSelect(dl, N->getValueType(1), isZero,
- DAG.getConstant(0, dl, N->getValueType(1)),
- Overflow);
+ SDValue LHSHigh, LHSLow, RHSHigh, RHSLow;
+ SplitInteger(LHS, LHSLow, LHSHigh);
+ SplitInteger(RHS, RHSLow, RHSHigh);
+ EVT HalfVT = LHSLow.getValueType()
+ , BitVT = N->getValueType(1);
+ SDVTList VTHalfMulO = DAG.getVTList(HalfVT, BitVT);
+ SDVTList VTFullAddO = DAG.getVTList(VT, BitVT);
+
+ SDValue HalfZero = DAG.getConstant(0, dl, HalfVT);
+ SDValue Overflow = DAG.getNode(ISD::AND, dl, BitVT,
+ DAG.getSetCC(dl, BitVT, LHSHigh, HalfZero, ISD::SETNE),
+ DAG.getSetCC(dl, BitVT, RHSHigh, HalfZero, ISD::SETNE));
+
+ SDValue One = DAG.getNode(ISD::UMULO, dl, VTHalfMulO, LHSHigh, RHSLow);
+ Overflow = DAG.getNode(ISD::OR, dl, BitVT, Overflow, One.getValue(1));
+ SDValue OneInHigh = DAG.getNode(ISD::BUILD_PAIR, dl, VT, HalfZero,
+ One.getValue(0));
+
+ SDValue Two = DAG.getNode(ISD::UMULO, dl, VTHalfMulO, RHSHigh, LHSLow);
+ Overflow = DAG.getNode(ISD::OR, dl, BitVT, Overflow, Two.getValue(1));
+ SDValue TwoInHigh = DAG.getNode(ISD::BUILD_PAIR, dl, VT, HalfZero,
+ Two.getValue(0));
+
+ // Cannot use `UMUL_LOHI` directly, because some 32-bit targets (ARM) do not
+ // know how to expand `i64,i64 = umul_lohi a, b` and abort (why isn’t this
+ // operation recursively legalized?).
+ //
+ // Many backends understand this pattern and will convert into LOHI
+ // themselves, if applicable.
+ SDValue Three = DAG.getNode(ISD::MUL, dl, VT,
+ DAG.getNode(ISD::ZERO_EXTEND, dl, VT, LHSLow),
+ DAG.getNode(ISD::ZERO_EXTEND, dl, VT, RHSLow));
+ SDValue Four = DAG.getNode(ISD::ADD, dl, VT, OneInHigh, TwoInHigh);
+ SDValue Five = DAG.getNode(ISD::UADDO, dl, VTFullAddO, Three, Four);
+ Overflow = DAG.getNode(ISD::OR, dl, BitVT, Overflow, Five.getValue(1));
+ SplitInteger(Five, Lo, Hi);
ReplaceValueWith(SDValue(N, 1), Overflow);
return;
}
OpenPOWER on IntegriCloud