diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 30 |
1 files changed, 26 insertions, 4 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 254f1362f1f..39e5948eca7 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -8025,7 +8025,7 @@ SDValue SITargetLowering::performExtractVectorEltCombine( switch(Opc) { default: - return SDValue(); + break; // TODO: Support other binary operations. case ISD::FADD: case ISD::FSUB: @@ -8051,12 +8051,34 @@ SDValue SITargetLowering::performExtractVectorEltCombine( } } - if (!DCI.isBeforeLegalize()) - return SDValue(); - unsigned VecSize = VecVT.getSizeInBits(); unsigned EltSize = EltVT.getSizeInBits(); + // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx) + // This elminates non-constant index and subsequent movrel or scratch access. + // Sub-dword vectors of size 2 dword or less have better implementation. + // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32 + // instructions. + if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) && + !isa<ConstantSDNode>(N->getOperand(1))) { + SDLoc SL(N); + SDValue Idx = N->getOperand(1); + EVT IdxVT = Idx.getValueType(); + SDValue V; + for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) { + SDValue IC = DAG.getConstant(I, SL, IdxVT); + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC); + if (I == 0) + V = Elt; + else + V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ); + } + return V; + } + + if (!DCI.isBeforeLegalize()) + return SDValue(); + // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit // elements. This exposes more load reduction opportunities by replacing // multiple small extract_vector_elements with a single 32-bit extract. |