summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target
diff options
context:
space:
mode:
authorSimon Pilgrim <llvm-dev@redking.me.uk>2017-01-04 08:05:42 +0000
committerSimon Pilgrim <llvm-dev@redking.me.uk>2017-01-04 08:05:42 +0000
commitc76ea4b6381f120acaef60a183b2c570d125ef7d (patch)
tree21380d2ba3b729d36c3cb25ac40360d0020503bd /llvm/lib/Target
parentd0aa53b9ae0b1bd1b093f3d40e0a0e9e128dedd2 (diff)
downloadbcm5719-llvm-c76ea4b6381f120acaef60a183b2c570d125ef7d.tar.gz
bcm5719-llvm-c76ea4b6381f120acaef60a183b2c570d125ef7d.zip
[X86] Attempt to pre-truncate arithmetic operations if useful
In some cases it's more efficient to combine TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) ) if the binop is legal for the truncated types. This is true for vector integer multiplication (especially vXi64), as well as ADD/AND/XOR/OR in cases where we only need to truncate one of the inputs at runtime (e.g. a duplicated input or a one-use constant we can fold). Further work could be done here - scalar cases (especially i64) could often benefit (if we avoid partial registers etc.), other opcodes, and better analysis of when truncating the inputs reduces costs. I have considered implementing this for all targets within the DAGCombiner but wasn't sure we could devise a suitable cost model system that would give us the range we need. Differential Revision: https://reviews.llvm.org/D28219 llvm-svn: 290947
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp81
1 files changed, 81 insertions, 0 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3351f1321c8..0c1f9e6cffa 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31833,6 +31833,83 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
+/// the codegen. \p N must be an ISD::TRUNCATE node; an empty SDValue is returned when no combine is performed.
+/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
+static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget, // Not referenced in this function body.
+ SDLoc &DL) {
+ assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
+ SDValue Src = N->getOperand(0);
+ unsigned Opcode = Src.getOpcode(); // Opcode of the node being truncated, not of N itself.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ EVT VT = N->getValueType(0); // Narrow (post-truncation) type.
+ EVT SrcVT = Src.getValueType(); // Wide (pre-truncation) type.
+
+ auto IsRepeatedOpOrOneUseConstant = [](SDValue Op0, SDValue Op1) { // True if both operands are the same value, or either is a build vector of constants.
+ // TODO: Add extra cases where we can truncate both inputs for the
+ // cost of one (or none).
+ // e.g. TRUNC( BINOP( EXT( X ), EXT( Y ) ) ) --> BINOP( X, Y )
+ if (Op0 == Op1) // The same value feeds both operands.
+ return true;
+
+ SDValue BC0 = peekThroughOneUseBitcasts(Op0);
+ SDValue BC1 = peekThroughOneUseBitcasts(Op1);
+ return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
+ ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
+ };
+
+ auto TruncateArithmetic = [&](SDValue N0, SDValue N1) { // Re-emits Src's opcode in VT on both operands truncated to VT.
+ SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
+ SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
+ return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
+ };
+
+ // Don't combine if the operation has other uses.
+ if (!N->isOnlyUserOf(Src.getNode()))
+ return SDValue();
+
+ // Only support vector truncation for now.
+ // TODO: i64 scalar math would benefit as well.
+ if (!VT.isVector())
+ return SDValue();
+
+ // In most cases it's only worth pre-truncating if we're only facing the cost
+ // of one truncation.
+ // i.e. if one of the inputs will constant fold or the input is repeated.
+ switch (Opcode) { // Opcodes not listed below fall out of the switch and are left alone.
+ case ISD::AND:
+ case ISD::XOR:
+ case ISD::OR: {
+ SDValue Op0 = Src.getOperand(0);
+ SDValue Op1 = Src.getOperand(1);
+ if (TLI.isOperationLegalOrPromote(Opcode, VT) && // Bitwise ops also accept a promoted VT, unlike the ADD/MUL path below.
+ IsRepeatedOpOrOneUseConstant(Op0, Op1))
+ return TruncateArithmetic(Op0, Op1);
+ break;
+ }
+
+ case ISD::MUL:
+ // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
+ // better to truncate if we have the chance.
+ if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
+ !TLI.isOperationLegal(Opcode, SrcVT))
+ return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
+ LLVM_FALLTHROUGH; // Otherwise MUL is handled by the same rule as ADD.
+ case ISD::ADD: {
+ SDValue Op0 = Src.getOperand(0);
+ SDValue Op1 = Src.getOperand(1);
+ if (TLI.isOperationLegal(Opcode, VT) &&
+ IsRepeatedOpOrOneUseConstant(Op0, Op1))
+ return TruncateArithmetic(Op0, Op1);
+ break;
+ }
+ }
+
+ return SDValue(); // No profitable pre-truncation found.
+}
+
/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
static SDValue
combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
@@ -32019,6 +32096,10 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
SDValue Src = N->getOperand(0);
SDLoc DL(N);
+ // Attempt to pre-truncate inputs to arithmetic ops instead.
+ if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
+ return V;
+
// Try to detect AVG pattern first.
if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
return Avg;
OpenPOWER on IntegriCloud