diff options
author | Sanjay Patel <spatel@rotateright.com> | 2015-01-22 18:21:26 +0000 |
---|---|---|
committer | Sanjay Patel <spatel@rotateright.com> | 2015-01-22 18:21:26 +0000 |
commit | 37c41c1d2c787ebc51fe40d1cc0d14e57f4b648d (patch) | |
tree | 97801edd5ab6581263fe97b02dfdd6793e90e7d6 | |
parent | 74e82fa4f3c07aa388dfe41ebc300aeeaeb5a9cb (diff) | |
download | bcm5719-llvm-37c41c1d2c787ebc51fe40d1cc0d14e57f4b648d.tar.gz bcm5719-llvm-37c41c1d2c787ebc51fe40d1cc0d14e57f4b648d.zip |
merge consecutive stores of extracted vector elements (PR21711)
This is a 2nd try at the same optimization as http://reviews.llvm.org/D6698.
That patch was checked in at r224611, but reverted at r225031 because it
caused a failure outside of the regression tests.
The cause of the crash was not recognizing consecutive stores that have mixed
source values (loads and vector element extracts), so this patch adds a check
to bail out if any store value is not coming from a vector element extract.
This patch also refactors the shared logic of the constant source and vector
extracted elements source cases into a helper function.
Differential Revision: http://reviews.llvm.org/D6850
llvm-svn: 226845
-rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 254 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/MergeConsecutiveStores.ll | 59 |
2 files changed, 221 insertions, 92 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 72f458118da..466e3606070 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -363,6 +363,28 @@ namespace { /// chain (aliasing node.) SDValue FindBetterChain(SDNode *N, SDValue Chain); + /// Holds a pointer to an LSBaseSDNode as well as information on where it + /// is located in a sequence of memory operations connected by a chain. + struct MemOpLink { + MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq): + MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { } + // Ptr to the mem node. + LSBaseSDNode *MemNode; + // Offset from the base ptr. + int64_t OffsetFromBase; + // What is the sequence number of this mem node. + // Lowest mem operand in the DAG starts at zero. + unsigned SequenceNum; + }; + + /// This is a helper function for MergeConsecutiveStores. When the source + /// elements of the consecutive stores are all constants or all extracted + /// vector elements, try to merge them into one larger store. + /// \return True if a merged store was created. + bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes, + EVT MemVT, unsigned NumElem, + bool IsConstantSrc, bool UseVector); + /// Merge consecutive store operations into a wide store. /// This optimization uses wide integers or vectors when possible. /// \return True if some memory operations were changed. @@ -9706,19 +9728,116 @@ struct BaseIndexOffset { } }; -/// Holds a pointer to an LSBaseSDNode as well as information on where it -/// is located in a sequence of memory operations connected by a chain. -struct MemOpLink { - MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq): - MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { } - // Ptr to the mem node. - LSBaseSDNode *MemNode; - // Offset from the base ptr. - int64_t OffsetFromBase; - // What is the sequence number of this mem node. - // Lowest mem operand in the DAG starts at zero. - unsigned SequenceNum; -}; +bool DAGCombiner::MergeStoresOfConstantsOrVecElts( + SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, + unsigned NumElem, bool IsConstantSrc, bool UseVector) { + // Make sure we have something to merge. + if (NumElem < 2) + return false; + + int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8; + LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; + unsigned EarliestNodeUsed = 0; + + for (unsigned i=0; i < NumElem; ++i) { + // Find a chain for the new wide-store operand. Notice that some + // of the store nodes that we found may not be selected for inclusion + // in the wide store. The chain we use needs to be the chain of the + // earliest store node which is *used* and replaced by the wide store. + if (StoreNodes[i].SequenceNum > StoreNodes[EarliestNodeUsed].SequenceNum) + EarliestNodeUsed = i; + } + + // The earliest Node in the DAG. + LSBaseSDNode *EarliestOp = StoreNodes[EarliestNodeUsed].MemNode; + SDLoc DL(StoreNodes[0].MemNode); + + SDValue StoredVal; + if (UseVector) { + // Find a legal type for the vector store. + EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem); + assert(TLI.isTypeLegal(Ty) && "Illegal vector store"); + if (IsConstantSrc) { + // A vector store with a constant source implies that the constant is + // zero; we only handle merging stores of constant zeros because the zero + // can be materialized without a load. + // It may be beneficial to loosen this restriction to allow non-zero + // store merging. + StoredVal = DAG.getConstant(0, Ty); + } else { + SmallVector<SDValue, 8> Ops; + for (unsigned i = 0; i < NumElem ; ++i) { + StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); + SDValue Val = St->getValue(); + // All of the operands of a BUILD_VECTOR must have the same type. + if (Val.getValueType() != MemVT) + return false; + Ops.push_back(Val); + } + + // Build the extracted vector elements back into a vector. + StoredVal = DAG.getNode(ISD::BUILD_VECTOR, DL, Ty, Ops); + } + } else { + // We should always use a vector store when merging extracted vector + // elements, so this path implies a store of constants. + assert(IsConstantSrc && "Merged vector elements should use vector store"); + + unsigned StoreBW = NumElem * ElementSizeBytes * 8; + APInt StoreInt(StoreBW, 0); + + // Construct a single integer constant which is made of the smaller + // constant inputs. + bool IsLE = TLI.isLittleEndian(); + for (unsigned i = 0; i < NumElem ; ++i) { + unsigned Idx = IsLE ? (NumElem - 1 - i) : i; + StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode); + SDValue Val = St->getValue(); + StoreInt <<= ElementSizeBytes*8; + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) { + StoreInt |= C->getAPIntValue().zext(StoreBW); + } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) { + StoreInt |= C->getValueAPF().bitcastToAPInt().zext(StoreBW); + } else { + llvm_unreachable("Invalid constant element type"); + } + } + + // Create the new Load and Store operations. + EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW); + StoredVal = DAG.getConstant(StoreInt, StoreTy); + } + + SDValue NewStore = DAG.getStore(EarliestOp->getChain(), DL, StoredVal, + FirstInChain->getBasePtr(), + FirstInChain->getPointerInfo(), + false, false, + FirstInChain->getAlignment()); + + // Replace the first store with the new store + CombineTo(EarliestOp, NewStore); + // Erase all other stores. + for (unsigned i = 0; i < NumElem ; ++i) { + if (StoreNodes[i].MemNode == EarliestOp) + continue; + StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); + // ReplaceAllUsesWith will replace all uses that existed when it was + // called, but graph optimizations may cause new ones to appear. For + // example, the case in pr14333 looks like + // + // St's chain -> St -> another store -> X + // + // And the only difference from St to the other store is the chain. + // When we change it's chain to be St's chain they become identical, + // get CSEed and the net result is that X is now a use of St. + // Since we know that St is redundant, just iterate. + while (!St->use_empty()) + DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain()); + deleteAndRecombine(St); + } + + return true; +} bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { EVT MemVT = St->getMemoryVT(); @@ -9731,11 +9850,14 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { return false; // Perform an early exit check. Do not bother looking at stored values that - // are not constants or loads. + // are not constants, loads, or extracted vector elements. SDValue StoredVal = St->getValue(); bool IsLoadSrc = isa<LoadSDNode>(StoredVal); - if (!isa<ConstantSDNode>(StoredVal) && !isa<ConstantFPSDNode>(StoredVal) && - !IsLoadSrc) + bool IsConstantSrc = isa<ConstantSDNode>(StoredVal) || + isa<ConstantFPSDNode>(StoredVal); + bool IsExtractVecEltSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT); + + if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecEltSrc) return false; // Only look at ends of store sequences. @@ -9877,7 +9999,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; // Store the constants into memory as one consecutive store. - if (!IsLoadSrc) { + if (IsConstantSrc) { unsigned LastLegalType = 0; unsigned LastLegalVectorType = 0; bool NonZero = false; @@ -9926,85 +10048,33 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) { bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors; unsigned NumElem = UseVector ? LastLegalVectorType : LastLegalType; - // Make sure we have something to merge. - if (NumElem < 2) - return false; - - unsigned EarliestNodeUsed = 0; - for (unsigned i=0; i < NumElem; ++i) { - // Find a chain for the new wide-store operand. Notice that some - // of the store nodes that we found may not be selected for inclusion - // in the wide store. The chain we use needs to be the chain of the - // earliest store node which is *used* and replaced by the wide store. - if (StoreNodes[i].SequenceNum > StoreNodes[EarliestNodeUsed].SequenceNum) - EarliestNodeUsed = i; - } - - // The earliest Node in the DAG. - LSBaseSDNode *EarliestOp = StoreNodes[EarliestNodeUsed].MemNode; - SDLoc DL(StoreNodes[0].MemNode); + return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem, + true, UseVector); + } - SDValue StoredVal; - if (UseVector) { + // When extracting multiple vector elements, try to store them + // in one vector store rather than a sequence of scalar stores. + if (IsExtractVecEltSrc) { + unsigned NumElem = 0; + for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) { + StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); + SDValue StoredVal = St->getValue(); + // This restriction could be loosened. + // Bail out if any stored values are not elements extracted from a vector. + // It should be possible to handle mixed sources, but load sources need + // more careful handling (see the block of code below that handles + // consecutive loads). + if (StoredVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return false; + // Find a legal type for the vector store. - EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem); - assert(TLI.isTypeLegal(Ty) && "Illegal vector store"); - StoredVal = DAG.getConstant(0, Ty); - } else { - unsigned StoreBW = NumElem * ElementSizeBytes * 8; - APInt StoreInt(StoreBW, 0); - - // Construct a single integer constant which is made of the smaller - // constant inputs. - bool IsLE = TLI.isLittleEndian(); - for (unsigned i = 0; i < NumElem ; ++i) { - unsigned Idx = IsLE ?(NumElem - 1 - i) : i; - StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode); - SDValue Val = St->getValue(); - StoreInt<<=ElementSizeBytes*8; - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) { - StoreInt|=C->getAPIntValue().zext(StoreBW); - } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) { - StoreInt|= C->getValueAPF().bitcastToAPInt().zext(StoreBW); - } else { - llvm_unreachable("Invalid constant element type"); - } - } - - // Create the new Load and Store operations. - EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW); - StoredVal = DAG.getConstant(StoreInt, StoreTy); - } - - SDValue NewStore = DAG.getStore(EarliestOp->getChain(), DL, StoredVal, - FirstInChain->getBasePtr(), - FirstInChain->getPointerInfo(), - false, false, - FirstInChain->getAlignment()); - - // Replace the first store with the new store - CombineTo(EarliestOp, NewStore); - // Erase all other stores. - for (unsigned i = 0; i < NumElem ; ++i) { - if (StoreNodes[i].MemNode == EarliestOp) - continue; - StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode); - // ReplaceAllUsesWith will replace all uses that existed when it was - // called, but graph optimizations may cause new ones to appear. For - // example, the case in pr14333 looks like - // - // St's chain -> St -> another store -> X - // - // And the only difference from St to the other store is the chain. - // When we change it's chain to be St's chain they become identical, - // get CSEed and the net result is that X is now a use of St. - // Since we know that St is redundant, just iterate. - while (!St->use_empty()) - DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain()); - deleteAndRecombine(St); + EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1); + if (TLI.isTypeLegal(Ty)) + NumElem = i + 1; } - return true; + return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem, + false, true); } // Below we handle the case of multiple consecutive stores that diff --git a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll index dfdaea523fd..f396e88f54c 100644 --- a/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll +++ b/llvm/test/CodeGen/X86/MergeConsecutiveStores.ll @@ -434,3 +434,62 @@ define void @loadStoreBaseIndexOffsetSextNoSex(i8* %a, i8* %b, i8* %c, i32 %n) { ; <label>:14 ret void } + +; PR21711 ( http://llvm.org/bugs/show_bug.cgi?id=21711 ) +define void @merge_vec_element_store(<8 x float> %v, float* %ptr) { + %vecext0 = extractelement <8 x float> %v, i32 0 + %vecext1 = extractelement <8 x float> %v, i32 1 + %vecext2 = extractelement <8 x float> %v, i32 2 + %vecext3 = extractelement <8 x float> %v, i32 3 + %vecext4 = extractelement <8 x float> %v, i32 4 + %vecext5 = extractelement <8 x float> %v, i32 5 + %vecext6 = extractelement <8 x float> %v, i32 6 + %vecext7 = extractelement <8 x float> %v, i32 7 + %arrayidx1 = getelementptr inbounds float* %ptr, i64 1 + %arrayidx2 = getelementptr inbounds float* %ptr, i64 2 + %arrayidx3 = getelementptr inbounds float* %ptr, i64 3 + %arrayidx4 = getelementptr inbounds float* %ptr, i64 4 + %arrayidx5 = getelementptr inbounds float* %ptr, i64 5 + %arrayidx6 = getelementptr inbounds float* %ptr, i64 6 + %arrayidx7 = getelementptr inbounds float* %ptr, i64 7 + store float %vecext0, float* %ptr, align 4 + store float %vecext1, float* %arrayidx1, align 4 + store float %vecext2, float* %arrayidx2, align 4 + store float %vecext3, float* %arrayidx3, align 4 + store float %vecext4, float* %arrayidx4, align 4 + store float %vecext5, float* %arrayidx5, align 4 + store float %vecext6, float* %arrayidx6, align 4 + store float %vecext7, float* %arrayidx7, align 4 + ret void + +; CHECK-LABEL: merge_vec_element_store +; CHECK: vmovups +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +} + +; This is a minimized test based on real code that was failing. +; We could merge stores (and loads) like this... + +define void @merge_vec_element_and_scalar_load([6 x i64]* %array) { + %idx0 = getelementptr inbounds [6 x i64]* %array, i64 0, i64 0 + %idx1 = getelementptr inbounds [6 x i64]* %array, i64 0, i64 1 + %idx4 = getelementptr inbounds [6 x i64]* %array, i64 0, i64 4 + %idx5 = getelementptr inbounds [6 x i64]* %array, i64 0, i64 5 + + %a0 = load i64* %idx0, align 8 + store i64 %a0, i64* %idx4, align 8 + + %b = bitcast i64* %idx1 to <2 x i64>* + %v = load <2 x i64>* %b, align 8 + %a1 = extractelement <2 x i64> %v, i32 0 + store i64 %a1, i64* %idx5, align 8 + ret void + +; CHECK-LABEL: merge_vec_element_and_scalar_load +; CHECK: movq (%rdi), %rax +; CHECK-NEXT: movq %rax, 32(%rdi) +; CHECK-NEXT: movq 8(%rdi), %rax +; CHECK-NEXT: movq %rax, 40(%rdi) +; CHECK-NEXT: retq +} |