[AMDGPU] Combine and (srl) into shl (bfe)

Perform DAG combine: and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb, where nb is the number of trailing zeroes in mask. It replaces two instructions with two others, and BFE is generally a more expensive one. However, this is only done if we are selecting a byte or word at an aligned boundary, which results in a proper SDWA operand pattern. It is only done if SDWA is supported. TODO: improve the SDWA pass to actually convert this pattern. It is not done now because we have an immediate in the instruction, which has to be moved into a VGPR. Differential Revision: https://reviews.llvm.org/D33455 llvm-svn: 303681
author: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> 2017-05-23 19:54:48 +0000
committer: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> 2017-05-23 19:54:48 +0000
commit: 53a21292f87d85e9a26c12a53fdbf8b3aeb66cc2 (patch)
tree: 265408298e5da81dca03f5975884818c2a621cd4 /llvm/lib/Target/AMDGPU/SIISelLowering.cpp
parent: e6366f505f3db54c7459bab0ff65fd9f5ddbab88 (diff)
download: bcm5719-llvm-53a21292f87d85e9a26c12a53fdbf8b3aeb66cc2.tar.gz
bcm5719-llvm-53a21292f87d85e9a26c12a53fdbf8b3aeb66cc2.zip
1 files changed, 34 insertions, 6 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 01c1f78e7ca..8c939f4bf00 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4229,12 +4229,40 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
   SDValue RHS = N->getOperand(1);
 
 
-  if (VT == MVT::i64) {
-    const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
-    if (CRHS) {
-      if (SDValue Split
-          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
-        return Split;
+  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
+  if (VT == MVT::i64 && CRHS) {
+    if (SDValue Split
+        = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
+      return Split;
+  }
+
+  if (CRHS && VT == MVT::i32) {
+    // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
+    // nb = number of trailing zeroes in mask
+    // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
+    // given that we are selecting 8 or 16 bit fields starting at byte boundary.
+    uint64_t Mask = CRHS->getZExtValue();
+    unsigned Bits = countPopulation(Mask);
+    if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
+        (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
+      if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
+        unsigned Shift = CShift->getZExtValue();
+        unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
+        unsigned Offset = NB + Shift;
+        if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
+          SDLoc SL(N);
+          SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
+                                    LHS->getOperand(0),
+                                    DAG.getConstant(Offset, SL, MVT::i32),
+                                    DAG.getConstant(Bits, SL, MVT::i32));
+          EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
+          SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
+                                    DAG.getValueType(NarrowVT));
+          SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
+                                    DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
+          return Shl;
+        }
+      }
     }
   }
author	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>	2017-05-23 19:54:48 +0000
committer	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>	2017-05-23 19:54:48 +0000
commit	53a21292f87d85e9a26c12a53fdbf8b3aeb66cc2 (patch)
tree	265408298e5da81dca03f5975884818c2a621cd4 /llvm/lib/Target/AMDGPU/SIISelLowering.cpp
parent	e6366f505f3db54c7459bab0ff65fd9f5ddbab88 (diff)
download	bcm5719-llvm-53a21292f87d85e9a26c12a53fdbf8b3aeb66cc2.tar.gz bcm5719-llvm-53a21292f87d85e9a26c12a53fdbf8b3aeb66cc2.zip