diff options
| author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-06-15 15:31:36 +0000 |
|---|---|---|
| committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2018-06-15 15:31:36 +0000 |
| commit | 63bc0e3cb9a68551b5d2d8c5a0927864204bd345 (patch) | |
| tree | 5f496a546a86aa4dc1b37b3b4781617620b505f1 /llvm/lib/Target/AMDGPU | |
| parent | 02dc7e19e2910b2b3a963105684af0f830885360 (diff) | |
| download | bcm5719-llvm-63bc0e3cb9a68551b5d2d8c5a0927864204bd345.tar.gz bcm5719-llvm-63bc0e3cb9a68551b5d2d8c5a0927864204bd345.zip | |
AMDGPU: Add combine for short vector extract_vector_elts
Try to access pieces 4 bytes at a time. This helps
various hasOneUse extract_vector_elt combines, such
as load width reductions.
Avoids test regressions in a future commit.
llvm-svn: 334836
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 43 |
1 files changed, 42 insertions, 1 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 1b91d743641..69401f455b4 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7097,8 +7097,11 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N, SDValue SITargetLowering::performExtractVectorEltCombine( SDNode *N, DAGCombinerInfo &DCI) const { SDValue Vec = N->getOperand(0); - SelectionDAG &DAG = DCI.DAG; + + EVT VecVT = Vec.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) { SDLoc SL(N); @@ -7139,6 +7142,44 @@ SDValue SITargetLowering::performExtractVectorEltCombine( Vec.getOperand(1), Idx)); } } + + if (!DCI.isBeforeLegalize()) + return SDValue(); + + unsigned VecSize = VecVT.getSizeInBits(); + unsigned EltSize = EltVT.getSizeInBits(); + + // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit + // elements. This exposes more load reduction opportunities by replacing + // multiple small extract_vector_elements with a single 32-bit extract. + auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (EltSize <= 16 && + EltVT.isByteSized() && + VecSize > 32 && + VecSize % 32 == 0 && + Idx) { + EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT); + + unsigned BitIndex = Idx->getZExtValue() * EltSize; + unsigned EltIdx = BitIndex / 32; + unsigned LeftoverBitIdx = BitIndex % 32; + SDLoc SL(N); + + SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec); + DCI.AddToWorklist(Cast.getNode()); + + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast, + DAG.getConstant(EltIdx, SL, MVT::i32)); + DCI.AddToWorklist(Elt.getNode()); + SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt, + DAG.getConstant(LeftoverBitIdx, SL, MVT::i32)); + DCI.AddToWorklist(Srl.getNode()); + + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl); + DCI.AddToWorklist(Trunc.getNode()); + return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc); + } + return SDValue(); } |

