summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target/AMDGPU
diff options
context:
space:
mode:
authorStanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>2017-05-23 19:54:48 +0000
committerStanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>2017-05-23 19:54:48 +0000
commit53a21292f87d85e9a26c12a53fdbf8b3aeb66cc2 (patch)
tree265408298e5da81dca03f5975884818c2a621cd4 /llvm/lib/Target/AMDGPU
parente6366f505f3db54c7459bab0ff65fd9f5ddbab88 (diff)
downloadbcm5719-llvm-53a21292f87d85e9a26c12a53fdbf8b3aeb66cc2.tar.gz
bcm5719-llvm-53a21292f87d85e9a26c12a53fdbf8b3aeb66cc2.zip
[AMDGPU] Combine and (srl) into shl (bfe)
Perform DAG combine: and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb Where nb is a number of trailing zeroes in mask. It replaces two instructions with two and BFE is generally a more expensive one. However this is only done if we are selecting a byte or word at an aligned boundary which results in a proper SDWA operand pattern. It is only done if SDWA is supported. TODO: improve SDWA pass to actually convert this pattern. It is not done now because we have an immediate in the instruction, which has be moved into a VGPR. Differential Revision: https://reviews.llvm.org/D33455 llvm-svn: 303681
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h8
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp40
3 files changed, 40 insertions, 11 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 9d7562d176d..723e8a7b54e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3478,7 +3478,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
DL);
}
- if ((OffsetVal + WidthVal) >= 32) {
+ if ((OffsetVal + WidthVal) >= 32 &&
+ !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
BitsFrom, ShiftVal);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index d122af0e27b..24ebde5dc93 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -420,6 +420,10 @@ public:
return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
}
+ bool hasSDWA() const {
+ return HasSDWA;
+ }
+
/// \brief Returns the offset in bytes from the start of the input buffer
/// of the first explicit kernel argument.
unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
@@ -674,10 +678,6 @@ public:
return HasInv2PiInlineImm;
}
- bool hasSDWA() const {
- return HasSDWA;
- }
-
bool hasDPP() const {
return HasDPP;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 01c1f78e7ca..8c939f4bf00 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4229,12 +4229,40 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
SDValue RHS = N->getOperand(1);
- if (VT == MVT::i64) {
- const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
- if (CRHS) {
- if (SDValue Split
- = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
- return Split;
+ const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
+ if (VT == MVT::i64 && CRHS) {
+ if (SDValue Split
+ = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
+ return Split;
+ }
+
+ if (CRHS && VT == MVT::i32) {
+ // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
+ // nb = number of trailing zeroes in mask
+ // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
+ // given that we are selecting 8 or 16 bit fields starting at byte boundary.
+ uint64_t Mask = CRHS->getZExtValue();
+ unsigned Bits = countPopulation(Mask);
+ if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
+ (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
+ if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
+ unsigned Shift = CShift->getZExtValue();
+ unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
+ unsigned Offset = NB + Shift;
+ if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
+ SDLoc SL(N);
+ SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
+ LHS->getOperand(0),
+ DAG.getConstant(Offset, SL, MVT::i32),
+ DAG.getConstant(Bits, SL, MVT::i32));
+ EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
+ SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
+ DAG.getValueType(NarrowVT));
+ SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
+ DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
+ return Shl;
+ }
+ }
}
}
OpenPOWER on IntegriCloud