[AMDGPU] combine extractelement into several selects

An extractelement with a non-constant index will, in most cases, be lowered to either a scratch access or a movrel loop. This patch converts such an instruction into a set of selects if the vector size is not too big. Differential Revision: https://reviews.llvm.org/D54351 llvm-svn: 346800
author: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> 2018-11-13 21:18:21 +0000
committer: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> 2018-11-13 21:18:21 +0000
commit: bcb34ac2ea6c7232a86f6da569afe3d6dbccd7c4 (patch)
tree: d2011b76c57899a48bfb716d6097dcc660c478aa /llvm/lib/Target/AMDGPU
parent: c2078fb1c8b63557d3b78af4b8897c617b1804df (diff)
download: bcm5719-llvm-bcb34ac2ea6c7232a86f6da569afe3d6dbccd7c4.tar.gz
bcm5719-llvm-bcb34ac2ea6c7232a86f6da569afe3d6dbccd7c4.zip
1 files changed, 26 insertions, 4 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 254f1362f1f..39e5948eca7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8025,7 +8025,7 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
 
     switch(Opc) {
     default:
-      return SDValue();
+      break;
       // TODO: Support other binary operations.
     case ISD::FADD:
     case ISD::FSUB:
@@ -8051,12 +8051,34 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
     }
   }
 
-  if (!DCI.isBeforeLegalize())
-    return SDValue();
-
   unsigned VecSize = VecVT.getSizeInBits();
   unsigned EltSize = EltVT.getSizeInBits();
 
+  // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
+  // This eliminates non-constant index and subsequent movrel or scratch access.
+  // Sub-dword vectors of size 2 dwords or less have a better implementation.
+  // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
+  // instructions.
+  if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) &&
+      !isa<ConstantSDNode>(N->getOperand(1))) {
+    SDLoc SL(N);
+    SDValue Idx = N->getOperand(1);
+    EVT IdxVT = Idx.getValueType();
+    SDValue V;
+    for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
+      SDValue IC = DAG.getConstant(I, SL, IdxVT);
+      SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
+      if (I == 0)
+        V = Elt;
+      else
+        V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
+    }
+    return V;
+  }
+
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
   // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
   // elements. This exposes more load reduction opportunities by replacing
   // multiple small extract_vector_elements with a single 32-bit extract.
author	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>	2018-11-13 21:18:21 +0000
committer	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>	2018-11-13 21:18:21 +0000
commit	bcb34ac2ea6c7232a86f6da569afe3d6dbccd7c4 (patch)
tree	d2011b76c57899a48bfb716d6097dcc660c478aa /llvm/lib/Target/AMDGPU
parent	c2078fb1c8b63557d3b78af4b8897c617b1804df (diff)
download	bcm5719-llvm-bcb34ac2ea6c7232a86f6da569afe3d6dbccd7c4.tar.gz bcm5719-llvm-bcb34ac2ea6c7232a86f6da569afe3d6dbccd7c4.zip