diff options
| author | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2018-11-13 21:18:21 +0000 | 
|---|---|---|
| committer | Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> | 2018-11-13 21:18:21 +0000 | 
| commit | bcb34ac2ea6c7232a86f6da569afe3d6dbccd7c4 (patch) | |
| tree | d2011b76c57899a48bfb716d6097dcc660c478aa /llvm/lib | |
| parent | c2078fb1c8b63557d3b78af4b8897c617b1804df (diff) | |
| download | bcm5719-llvm-bcb34ac2ea6c7232a86f6da569afe3d6dbccd7c4.tar.gz bcm5719-llvm-bcb34ac2ea6c7232a86f6da569afe3d6dbccd7c4.zip | |
[AMDGPU] combine extractelement into several selects
An extractelement with a non-constant index will, in most cases, be lowered
either to a scratch access or to a movrel loop. This patch converts such an
instruction into a set of selects if the vector size is not too big.
Differential Revision: https://reviews.llvm.org/D54351
llvm-svn: 346800
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 30 | 
1 files changed, 26 insertions, 4 deletions
| diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 254f1362f1f..39e5948eca7 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -8025,7 +8025,7 @@ SDValue SITargetLowering::performExtractVectorEltCombine(      switch(Opc) {      default: -      return SDValue(); +      break;        // TODO: Support other binary operations.      case ISD::FADD:      case ISD::FSUB: @@ -8051,12 +8051,34 @@ SDValue SITargetLowering::performExtractVectorEltCombine(      }    } -  if (!DCI.isBeforeLegalize()) -    return SDValue(); -    unsigned VecSize = VecVT.getSizeInBits();    unsigned EltSize = EltVT.getSizeInBits(); +  // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx) +  // This elminates non-constant index and subsequent movrel or scratch access. +  // Sub-dword vectors of size 2 dword or less have better implementation. +  // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32 +  // instructions. +  if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) && +      !isa<ConstantSDNode>(N->getOperand(1))) { +    SDLoc SL(N); +    SDValue Idx = N->getOperand(1); +    EVT IdxVT = Idx.getValueType(); +    SDValue V; +    for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) { +      SDValue IC = DAG.getConstant(I, SL, IdxVT); +      SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC); +      if (I == 0) +        V = Elt; +      else +        V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ); +    } +    return V; +  } + +  if (!DCI.isBeforeLegalize()) +    return SDValue(); +    // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit    // elements. This exposes more load reduction opportunities by replacing    // multiple small extract_vector_elements with a single 32-bit extract. | 

