diff options
| author | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2017-05-23 19:54:48 +0000 |
|---|---|---|
| committer | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2017-05-23 19:54:48 +0000 |
| commit | 53a21292f87d85e9a26c12a53fdbf8b3aeb66cc2 (patch) | |
| tree | 265408298e5da81dca03f5975884818c2a621cd4 /llvm/lib/Target/AMDGPU/SIISelLowering.cpp | |
| parent | e6366f505f3db54c7459bab0ff65fd9f5ddbab88 (diff) | |
| download | bcm5719-llvm-53a21292f87d85e9a26c12a53fdbf8b3aeb66cc2.tar.gz bcm5719-llvm-53a21292f87d85e9a26c12a53fdbf8b3aeb66cc2.zip | |
[AMDGPU] Combine and (srl) into shl (bfe)
Perform DAG combine:
and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
Where nb is a number of trailing zeroes in mask.
It replaces two instructions with two and BFE is generally a more
expensive one. However this is only done if we are selecting a byte
or word at an aligned boundary which results in a proper SDWA
operand pattern. It is only done if SDWA is supported.
TODO: improve SDWA pass to actually convert this pattern. It is not
done now because we have an immediate in the instruction, which has
be moved into a VGPR.
Differential Revision: https://reviews.llvm.org/D33455
llvm-svn: 303681
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 40 |
1 files changed, 34 insertions, 6 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 01c1f78e7ca..8c939f4bf00 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4229,12 +4229,40 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, SDValue RHS = N->getOperand(1); - if (VT == MVT::i64) { - const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); - if (CRHS) { - if (SDValue Split - = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS)) - return Split; + const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); + if (VT == MVT::i64 && CRHS) { + if (SDValue Split + = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS)) + return Split; + } + + if (CRHS && VT == MVT::i32) { + // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb + // nb = number of trailing zeroes in mask + // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass, + // given that we are selecting 8 or 16 bit fields starting at byte boundary. + uint64_t Mask = CRHS->getZExtValue(); + unsigned Bits = countPopulation(Mask); + if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL && + (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) { + if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) { + unsigned Shift = CShift->getZExtValue(); + unsigned NB = CRHS->getAPIntValue().countTrailingZeros(); + unsigned Offset = NB + Shift; + if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary. + SDLoc SL(N); + SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, + LHS->getOperand(0), + DAG.getConstant(Offset, SL, MVT::i32), + DAG.getConstant(Bits, SL, MVT::i32)); + EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits); + SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE, + DAG.getValueType(NarrowVT)); + SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext, + DAG.getConstant(NB, SDLoc(CRHS), MVT::i32)); + return Shl; + } + } } } |

