diff options
| author | Zvi Rackover <zvi.rackover@intel.com> | 2017-10-09 20:01:10 +0000 |
|---|---|---|
| committer | Zvi Rackover <zvi.rackover@intel.com> | 2017-10-09 20:01:10 +0000 |
| commit | c1d5955684dba9a31cb0ff3b4b61fe2a84e392e8 (patch) | |
| tree | e7d0f5739578bc8ac1a3779cc51864a96e5e9e9a /llvm/lib/Target/X86 | |
| parent | 663ba15ed6cf3509ef5b77bec985e06666d95a10 (diff) | |
| download | bcm5719-llvm-c1d5955684dba9a31cb0ff3b4b61fe2a84e392e8.tar.gz bcm5719-llvm-c1d5955684dba9a31cb0ff3b4b61fe2a84e392e8.zip | |
[X86] Unsigned saturation subtraction canonicalization [the backend part]
Summary:
On behalf of julia.koval@intel.com
The patch transforms canonical version of unsigned saturation, which is sub(max(a,b),a) or sub(a,min(a,b)) to special psubus insturuction on targets, which support it(8bit and 16bit uints).
umax(a,b) - b -> subus(a,b)
a - umin(a,b) -> subus(a,b)
There is also extra case handled, when right part of sub is 32 bit and can be truncated, using UMIN(this transformation was discussed in https://reviews.llvm.org/D25987).
The example of special case code:
```
void foo(unsigned short *p, int max, int n) {
int i;
unsigned m;
for (i = 0; i < n; i++) {
m = *--p;
*p = (unsigned short)(m >= max ? m-max : 0);
}
}
```
Max in this example is truncated to max_short value, if it is greater than m, or just truncated to 16 bit, if it is not. It is vaid transformation, because if max > max_short, result of the expression will be zero.
Here is the table of types, I try to support, special case items are bold:
| Size | 128 | 256 | 512
| ----- | ----- | ----- | -----
| i8 | v16i8 | v32i8 | v64i8
| i16 | v8i16 | v16i16 | v32i16
| i32 | | **v8i32** | **v16i32**
| i64 | | | **v8i64**
Reviewers: zvi, spatel, DavidKreitzer, RKSimon
Reviewed By: zvi
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D37534
llvm-svn: 315237
Diffstat (limited to 'llvm/lib/Target/X86')
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 87 |
1 files changed, 87 insertions, 0 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 7e9ec7f1e42..0f6a27ed9e7 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -35901,6 +35901,89 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, return combineAddOrSubToADCOrSBB(N, DAG); } +static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + EVT VT = N->getValueType(0); + + // PSUBUS is supported, starting from SSE2, but special preprocessing + // for v8i32 requires umin, which appears in SSE41. + if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) && + !(Subtarget.hasSSE41() && (VT == MVT::v8i32)) && + !(Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)) && + !(Subtarget.hasAVX512() && Subtarget.hasBWI() && + (VT == MVT::v64i8 || VT == MVT::v32i16 || VT == MVT::v16i32 || + VT == MVT::v8i64))) + return SDValue(); + + SDValue SubusLHS, SubusRHS; + // Try to find umax(a,b) - b or a - umin(a,b) patterns + // they may be converted to subus(a,b). + // TODO: Need to add IR cannonicialization for this code. + if (Op0.getOpcode() == ISD::UMAX) { + SubusRHS = Op1; + SDValue MaxLHS = Op0.getOperand(0); + SDValue MaxRHS = Op0.getOperand(1); + if (DAG.isEqualTo(MaxLHS, Op1)) + SubusLHS = MaxRHS; + else if (DAG.isEqualTo(MaxRHS, Op1)) + SubusLHS = MaxLHS; + else + return SDValue(); + } else if (Op1.getOpcode() == ISD::UMIN) { + SubusLHS = Op0; + SDValue MinLHS = Op1.getOperand(0); + SDValue MinRHS = Op1.getOperand(1); + if (DAG.isEqualTo(MinLHS, Op0)) + SubusRHS = MinRHS; + else if (DAG.isEqualTo(MinRHS, Op0)) + SubusRHS = MinLHS; + else + return SDValue(); + } else + return SDValue(); + + // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with + // special preprocessing in some cases. + if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64) + return DAG.getNode(X86ISD::SUBUS, SDLoc(N), VT, SubusLHS, SubusRHS); + + // Special preprocessing case can be only applied + // if the value was zero extended from 16 bit, + // so we require first 16 bits to be zeros for 32 bit + // values, or first 48 bits for 64 bit values. + KnownBits Known; + DAG.computeKnownBits(SubusLHS, Known); + unsigned NumZeros = Known.countMinLeadingZeros(); + if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16) + return SDValue(); + + EVT ExtType = SubusLHS.getValueType(); + EVT ShrinkedType; + if (VT == MVT::v8i32 || VT == MVT::v8i64) + ShrinkedType = MVT::v8i16; + else + ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16; + + // If SubusLHS is zeroextended - truncate SubusRHS to it's + // size SubusRHS = umin(0xFFF.., SubusRHS). + SDValue SaturationConst = + DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(), + ShrinkedType.getScalarSizeInBits()), + SDLoc(SubusLHS), ExtType); + SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS, + SaturationConst); + SDValue NewSubusLHS = + DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType); + SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType); + SDValue Psubus = DAG.getNode(X86ISD::SUBUS, SDLoc(N), ShrinkedType, + NewSubusLHS, NewSubusRHS); + // Zero extend the result, it may be used somewhere as 32 bit, + // if not zext and following trunc will shrink. + return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType); +} + static SDValue combineSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); @@ -35934,6 +36017,10 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineIncDecVector(N, DAG)) return V; + // Try to create PSUBUS if SUB's argument is max/min + if (SDValue V = combineSubToSubus(N, DAG, Subtarget)) + return V; + return combineAddOrSubToADCOrSBB(N, DAG); } |

