summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target/X86
diff options
context:
space:
mode:
authorZvi Rackover <zvi.rackover@intel.com>2017-10-09 20:01:10 +0000
committerZvi Rackover <zvi.rackover@intel.com>2017-10-09 20:01:10 +0000
commitc1d5955684dba9a31cb0ff3b4b61fe2a84e392e8 (patch)
treee7d0f5739578bc8ac1a3779cc51864a96e5e9e9a /llvm/lib/Target/X86
parent663ba15ed6cf3509ef5b77bec985e06666d95a10 (diff)
downloadbcm5719-llvm-c1d5955684dba9a31cb0ff3b4b61fe2a84e392e8.tar.gz
bcm5719-llvm-c1d5955684dba9a31cb0ff3b4b61fe2a84e392e8.zip
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary: On behalf of julia.koval@intel.com. The patch transforms the canonical form of unsigned saturating subtraction, which is sub(max(a,b),a) or sub(a,min(a,b)), into the special psubus instruction on targets that support it (8-bit and 16-bit unsigned integers): umax(a,b) - b -> subus(a,b) and a - umin(a,b) -> subus(a,b). There is also an extra case handled, when the right part of the sub is 32 bit and can be truncated using UMIN (this transformation was discussed in https://reviews.llvm.org/D25987). An example of the special-case code: ``` void foo(unsigned short *p, int max, int n) { int i; unsigned m; for (i = 0; i < n; i++) { m = *--p; *p = (unsigned short)(m >= max ? m-max : 0); } } ``` Max in this example is truncated to the max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is a valid transformation, because if max > max_short, the result of the expression will be zero. Here is the table of types I try to support; special-case items are bold: | Size | 128 | 256 | 512 | ----- | ----- | ----- | ----- | i8 | v16i8 | v32i8 | v64i8 | i16 | v8i16 | v16i16 | v32i16 | i32 | | **v8i32** | **v16i32** | i64 | | | **v8i64** Reviewers: zvi, spatel, DavidKreitzer, RKSimon Reviewed By: zvi Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D37534 llvm-svn: 315237
Diffstat (limited to 'llvm/lib/Target/X86')
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp87
1 files changed, 87 insertions, 0 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7e9ec7f1e42..0f6a27ed9e7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35901,6 +35901,89 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
return combineAddOrSubToADCOrSBB(N, DAG);
}
+static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+
+ // PSUBUS is available from SSE2 on, but the special preprocessing
+ // for v8i32 requires UMIN, which only appears in SSE4.1.
+ if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) &&
+ !(Subtarget.hasSSE41() && (VT == MVT::v8i32)) &&
+ !(Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
+ !(Subtarget.hasAVX512() && Subtarget.hasBWI() &&
+ (VT == MVT::v64i8 || VT == MVT::v32i16 || VT == MVT::v16i32 ||
+ VT == MVT::v8i64)))
+ return SDValue();
+
+ SDValue SubusLHS, SubusRHS;
+ // Try to match umax(a,b) - b or a - umin(a,b) (operands in either order);
+ // both patterns can be converted to subus(a,b).
+ // TODO: Need to add IR canonicalization for this code.
+ if (Op0.getOpcode() == ISD::UMAX) {
+ SubusRHS = Op1;
+ SDValue MaxLHS = Op0.getOperand(0);
+ SDValue MaxRHS = Op0.getOperand(1);
+ if (DAG.isEqualTo(MaxLHS, Op1))
+ SubusLHS = MaxRHS;
+ else if (DAG.isEqualTo(MaxRHS, Op1))
+ SubusLHS = MaxLHS;
+ else
+ return SDValue();
+ } else if (Op1.getOpcode() == ISD::UMIN) {
+ SubusLHS = Op0;
+ SDValue MinLHS = Op1.getOperand(0);
+ SDValue MinRHS = Op1.getOperand(1);
+ if (DAG.isEqualTo(MinLHS, Op0))
+ SubusRHS = MinRHS;
+ else if (DAG.isEqualTo(MinRHS, Op0))
+ SubusRHS = MinLHS;
+ else
+ return SDValue();
+ } else
+ return SDValue();
+
+ // All other accepted types map to SUBUS directly. PSUBUS doesn't support
+ // v8i32/v8i64/v16i32; those need the special preprocessing below.
+ if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
+ return DAG.getNode(X86ISD::SUBUS, SDLoc(N), VT, SubusLHS, SubusRHS);
+
+ // The special preprocessing can only be applied if SubusLHS
+ // was zero extended from 16 bits (or fewer), so we require
+ // at least 16 known leading zero bits for 32-bit values,
+ // or at least 48 known leading zero bits for 64-bit values.
+ KnownBits Known;
+ DAG.computeKnownBits(SubusLHS, Known);
+ unsigned NumZeros = Known.countMinLeadingZeros();
+ if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
+ return SDValue();
+
+ EVT ExtType = SubusLHS.getValueType();
+ EVT ShrinkedType;
+ if (VT == MVT::v8i32 || VT == MVT::v8i64)
+ ShrinkedType = MVT::v8i16;
+ else
+ ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;
+
+ // SubusLHS is zero extended, so clamp SubusRHS to the narrow type's range
+ // before truncating it to that size: SubusRHS = umin(0xFFF.., SubusRHS).
+ SDValue SaturationConst =
+ DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
+ ShrinkedType.getScalarSizeInBits()),
+ SDLoc(SubusLHS), ExtType);
+ SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
+ SaturationConst);
+ SDValue NewSubusLHS =
+ DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
+ SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
+ SDValue Psubus = DAG.getNode(X86ISD::SUBUS, SDLoc(N), ShrinkedType,
+ NewSubusLHS, NewSubusRHS);
+ // Zero extend the result back to the original type, as users may need the
+ // wide value; if not, the zext and a following trunc are expected to fold.
+ return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
+}
+
static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
@@ -35934,6 +36017,10 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineIncDecVector(N, DAG))
return V;
+ // Try to create PSUBUS if SUB's argument is max/min
+ if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
+ return V;
+
return combineAddOrSubToADCOrSBB(N, DAG);
}
OpenPOWER on IntegriCloud