-rw-r--r--  llvm/lib/Transforms/Scalar/SROA.cpp     | 486
-rw-r--r--  llvm/test/Transforms/SROA/basictest.ll  |  56
-rw-r--r--  llvm/test/Transforms/SROA/big-endian.ll |  59
3 files changed, 520 insertions(+), 81 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 688755f0704..48f38f7f473 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -237,6 +237,24 @@ public:
const_iterator end() const { return Slices.end(); }
/// @}
+ /// \brief Erase a range of slices.
+ void erase(iterator Start, iterator Stop) {
+ Slices.erase(Start, Stop);
+ }
+
+ /// \brief Insert new slices for this alloca.
+ ///
+ /// This moves the slices into the alloca's slices collection, and re-sorts
+ /// everything so that the usual ordering properties of the alloca's slices
+ /// hold.
+ void insert(ArrayRef<Slice> NewSlices) {
+ int OldSize = Slices.size();
+ Slices.append(NewSlices.begin(), NewSlices.end());
+ auto SliceI = Slices.begin() + OldSize;
+ std::sort(SliceI, Slices.end());
+ std::inplace_merge(Slices.begin(), SliceI, Slices.end());
+ }
+
// Forward declare an iterator to befriend it.
class partition_iterator;
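The insert() helper above keeps the slice list sorted by appending the new slices, sorting only the appended tail, and merging the two sorted runs with std::inplace_merge. A minimal standalone sketch of that idiom, using plain integers instead of Slice objects (all names here are illustrative, not code from the patch):

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  // An already-sorted collection, as AllocaSlices maintains for its slices.
  std::vector<int> Values = {1, 4, 9};

  // Append the (possibly unsorted) new elements, remembering where the old
  // sorted run ends.
  std::vector<int> NewValues = {7, 2};
  auto OldSize = Values.size();
  Values.insert(Values.end(), NewValues.begin(), NewValues.end());

  // Sort just the appended tail, then merge the two sorted runs in place.
  auto Mid = Values.begin() + OldSize;
  std::sort(Mid, Values.end());
  std::inplace_merge(Values.begin(), Mid, Values.end());

  assert(std::is_sorted(Values.begin(), Values.end()));
  return 0;
}

Sorting only the new tail and merging is cheaper than re-sorting the whole container, which is exactly the situation insert() is in since the existing slices are already ordered.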
@@ -1022,6 +1040,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
void AllocaSlices::print(raw_ostream &OS, const_iterator I,
StringRef Indent) const {
printSlice(OS, I, Indent);
+ OS << "\n";
printUse(OS, I, Indent);
}
@@ -1029,7 +1048,7 @@ void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
StringRef Indent) const {
OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
<< " slice #" << (I - begin())
- << (I->isSplittable() ? " (splittable)" : "") << "\n";
+ << (I->isSplittable() ? " (splittable)" : "");
}
void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
@@ -1245,6 +1264,7 @@ private:
friend class PHIOrSelectSpeculator;
friend class AllocaSliceRewriter;
+ bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
bool rewritePartition(AllocaInst &AI, AllocaSlices &AS,
AllocaSlices::Partition &P);
bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
@@ -1794,6 +1814,27 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
return Ptr;
}
+/// \brief Compute the adjusted alignment for a load or store from an offset.
+static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset,
+ const DataLayout &DL) {
+ unsigned Alignment;
+ Type *Ty;
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ Alignment = LI->getAlignment();
+ Ty = LI->getType();
+ } else if (auto *SI = dyn_cast<StoreInst>(I)) {
+ Alignment = SI->getAlignment();
+ Ty = SI->getValueOperand()->getType();
+ } else {
+ llvm_unreachable("Only loads and stores are allowed!");
+ }
+
+ if (!Alignment)
+ Alignment = DL.getABITypeAlignment(Ty);
+
+ return MinAlign(Alignment, Offset);
+}
+
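getAdjustedAlignment() above conservatively clamps the known alignment by the offset being accessed: an access Offset bytes past a pointer that is only Alignment-aligned can only be assumed aligned to the largest power of two dividing both values, which is what llvm::MinAlign computes. A small self-contained sketch of that arithmetic (MinAlignSketch is a local stand-in for illustration, not the LLVM function):

#include <cassert>
#include <cstdint>

// Largest power of two dividing both A and B, i.e. the lowest set bit of A|B.
static uint64_t MinAlignSketch(uint64_t A, uint64_t B) {
  return (A | B) & (1 + ~(A | B));
}

int main() {
  // A field 4 bytes into an 8-byte-aligned object is only 4-byte aligned.
  assert(MinAlignSketch(8, 4) == 4);
  // Offsets that are multiples of the alignment preserve it.
  assert(MinAlignSketch(8, 16) == 8);
  // Offset 0 keeps whatever alignment the base pointer had.
  assert(MinAlignSketch(16, 0) == 16);
  return 0;
}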
/// \brief Test whether we can convert a value from the old to the new type.
///
/// This predicate should be used to guard calls to convertValue in order to
@@ -2412,6 +2453,7 @@ public:
BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
DEBUG(AS.printSlice(dbgs(), I, ""));
+ DEBUG(dbgs() << "\n");
// Compute the intersecting offset range.
assert(BeginOffset < NewAllocaEndOffset);
@@ -3426,6 +3468,444 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
return SubTy;
}
+/// \brief Pre-split loads and stores to simplify rewriting.
+///
+/// We want to break up the splittable load+store pairs as much as
+/// possible. This is important to do as a preprocessing step, as once we
+/// start rewriting the accesses to partitions of the alloca we lose the
+/// necessary information to correctly split apart paired loads and stores
+/// which both point into this alloca. The case to consider is something like
+/// the following:
+///
+/// %a = alloca [12 x i8]
+/// %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
+/// %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
+/// %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
+/// %iptr1 = bitcast i8* %gep1 to i64*
+/// %iptr2 = bitcast i8* %gep2 to i64*
+/// %fptr1 = bitcast i8* %gep1 to float*
+/// %fptr2 = bitcast i8* %gep2 to float*
+/// %fptr3 = bitcast i8* %gep3 to float*
+/// store float 0.0, float* %fptr1
+/// store float 1.0, float* %fptr2
+/// %v = load i64* %iptr1
+/// store i64 %v, i64* %iptr2
+/// %f1 = load float* %fptr2
+/// %f2 = load float* %fptr3
+///
+/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
+/// promote everything so we recover the 2 SSA values that should have been
+/// there all along.
+///
+/// \returns true if any changes are made.
+bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
+ DEBUG(dbgs() << "Pre-splitting loads and stores\n");
+
+ // Track the loads and stores which are candidates for pre-splitting here, in
+ // the order they first appear during the partition scan. These give stable
+ // iteration order and a basis for tracking which loads and stores we
+ // actually split.
+ SmallVector<LoadInst *, 4> Loads;
+ SmallVector<StoreInst *, 4> Stores;
+
+ // We need to accumulate the splits required of each load or store where we
+ // can find them via a direct lookup. This is important to cross-check loads
+ // and stores against each other. We also track the slice so that we can kill
+ // all the slices that end up split.
+ struct SplitOffsets {
+ Slice *S;
+ std::vector<uint64_t> Splits;
+ };
+ SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
+
+ DEBUG(dbgs() << " Searching for candidate loads and stores\n");
+ for (auto &P : AS.partitions()) {
+ for (Slice &S : P) {
+ if (!S.isSplittable())
+ continue;
+ if (S.endOffset() <= P.endOffset())
+ continue;
+ assert(P.endOffset() > S.beginOffset() &&
+ "Empty or backwards partition!");
+
+ // Determine if this is a pre-splittable slice.
+ Instruction *I = cast<Instruction>(S.getUse()->getUser());
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ assert(!LI->isVolatile() && "Cannot split volatile loads!");
+
+ // The load must be used exclusively to store into other pointers for
+ // us to be able to arbitrarily pre-split it. The stores must also be
+ // simple to avoid changing semantics.
+ auto IsLoadSimplyStored = [](LoadInst *LI) {
+ for (User *LU : LI->users()) {
+ auto *SI = dyn_cast<StoreInst>(LU);
+ if (!SI || !SI->isSimple())
+ return false;
+ }
+ return true;
+ };
+ if (!IsLoadSimplyStored(LI))
+ continue;
+
+ Loads.push_back(LI);
+ } else if (auto *SI = dyn_cast<StoreInst>(I)) {
+ if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
+ continue;
+ auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
+ if (!StoredLoad || !StoredLoad->isSimple())
+ continue;
+ assert(!SI->isVolatile() && "Cannot split volatile stores!");
+
+ Stores.push_back(SI);
+ } else {
+ // Other uses cannot be pre-split.
+ continue;
+ }
+
+ // Record the initial split.
+ DEBUG(dbgs() << " Candidate: " << *I << "\n");
+ auto &Offsets = SplitOffsetsMap[I];
+ assert(Offsets.Splits.empty() &&
+ "Should not have splits the first time we see an instruction!");
+ Offsets.S = &S;
+ Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
+ }
+
+ // Now scan the already split slices, and add a split for any of them which
+ // we're going to pre-split.
+ for (Slice *S : P.splitSliceTails()) {
+ auto SplitOffsetsMapI =
+ SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
+ if (SplitOffsetsMapI == SplitOffsetsMap.end())
+ continue;
+ auto &Offsets = SplitOffsetsMapI->second;
+
+ assert(Offsets.S == S && "Found a mismatched slice!");
+ assert(!Offsets.Splits.empty() &&
+ "Cannot have an empty set of splits on the second partition!");
+ assert(Offsets.Splits.back() == P.beginOffset() - Offsets.S->beginOffset() &&
+ "Previous split does not end where this one begins!");
+
+ // Record each split. The last partition's end isn't needed as the size
+ // of the slice dictates that.
+ if (S->endOffset() > P.endOffset())
+ Offsets.Splits.push_back(P.endOffset() - S->beginOffset());
+ }
+ }
+
+ // We may have split loads where some of their stores are split stores. For
+ // such loads and stores, we can only pre-split them if their splits exactly
+ // match relative to their starting offset. We have to verify this prior to
+ // any rewriting.
+ SmallPtrSet<LoadInst *, 4> BadSplitLoads;
+ Stores.erase(
+ std::remove_if(
+ Stores.begin(), Stores.end(),
+ [&BadSplitLoads, &SplitOffsetsMap](
+ StoreInst *SI) {
+ // Lookup the load we are storing in our map of split offsets.
+ auto *LI = cast<LoadInst>(SI->getValueOperand());
+ auto LoadOffsetsI = SplitOffsetsMap.find(LI);
+ if (LoadOffsetsI == SplitOffsetsMap.end())
+ return false; // Unrelated loads are always safe.
+ auto &LoadOffsets = LoadOffsetsI->second;
+
+ // Now lookup the store's offsets.
+ auto &StoreOffsets = SplitOffsetsMap[SI];
+
+ // If the relative offsets of each split in the load and store
+ // match exactly, then we can split them and we don't need to
+ // remove them here.
+ if (LoadOffsets.Splits == StoreOffsets.Splits)
+ return false;
+
+ DEBUG(dbgs() << " Mismatched splits for load and store:\n"
+ << " " << *LI << "\n"
+ << " " << *SI << "\n");
+
+ // We've found a store and load that we need to split with
+ // mismatched relative splits. Just give up on them and remove both
+ // instructions from our list of candidates.
+ BadSplitLoads.insert(LI);
+ return true;
+ }),
+ Stores.end());
+ Loads.erase(std::remove_if(Loads.begin(), Loads.end(),
+ [&BadSplitLoads](LoadInst *LI) {
+ return BadSplitLoads.count(LI);
+ }),
+ Loads.end());
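The two erase/remove_if calls above implement a reject-and-propagate filter: a store whose split points disagree with those of the load it copies is removed, and the offending load is recorded so it can be removed from its own candidate list in the second pass. The same shape in a tiny self-contained sketch (the Pair records and values are made up for illustration):

#include <algorithm>
#include <cassert>
#include <set>
#include <vector>

int main() {
  // Each candidate "store" carries the id of the load it copies plus the
  // relative split offsets computed for each side of the pair.
  struct Pair {
    int LoadId;
    std::vector<int> LoadSplits, StoreSplits;
  };
  std::vector<Pair> Stores = {{0, {4}, {4}}, {1, {4}, {8}}};
  std::set<int> BadLoads;

  // Drop any store whose splits disagree with its load's splits, remembering
  // the load so the caller can drop it from the load candidate list too.
  Stores.erase(std::remove_if(Stores.begin(), Stores.end(),
                              [&BadLoads](const Pair &P) {
                                if (P.LoadSplits == P.StoreSplits)
                                  return false;
                                BadLoads.insert(P.LoadId);
                                return true;
                              }),
               Stores.end());

  assert(Stores.size() == 1 && BadLoads.count(1) == 1);
  return 0;
}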
+
+ // If no loads or stores are left, there is no pre-splitting to be done for
+ // this alloca.
+ if (Loads.empty() && Stores.empty())
+ return false;
+
+ // From here on, we can't fail and will be building new accesses, so rig up
+ // an IR builder.
+ IRBuilderTy IRB(&AI);
+
+ // Collect the new slices which we will merge into the alloca slices.
+ SmallVector<Slice, 4> NewSlices;
+
+ // Track any allocas we end up splitting loads and stores for so we can
+ // iterate on them.
+ SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
+
+ // At this point, we have collected all of the loads and stores we can
+ // pre-split, and the specific splits needed for them. We actually do the
+ // splitting in a specific order to handle the case where one of the loads is
+ // the value operand of one of the stores.
+ //
+ // First, we rewrite all of the split loads, and just accumulate each split
+ // load in a parallel structure. We also build the slices for them and append
+ // them to the alloca slices.
+ SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
+ std::vector<LoadInst *> SplitLoads;
+ for (LoadInst *LI : Loads) {
+ SplitLoads.clear();
+
+ IntegerType *Ty = cast<IntegerType>(LI->getType());
+ uint64_t LoadSize = Ty->getBitWidth() / 8;
+ assert(LoadSize > 0 && "Cannot have a zero-sized integer load!");
+
+ auto &Offsets = SplitOffsetsMap[LI];
+ assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
+ "Slice size should always match load size exactly!");
+ uint64_t BaseOffset = Offsets.S->beginOffset();
+ assert(BaseOffset + LoadSize > BaseOffset &&
+ "Cannot represent alloca access size using 64-bit integers!");
+
+ Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
+ IRB.SetInsertPoint(BasicBlock::iterator(LI));
+
+ DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
+
+ uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
+ int Idx = 0, Size = Offsets.Splits.size();
+ for (;;) {
+ auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
+ auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
+ LoadInst *PLoad = IRB.CreateAlignedLoad(
+ getAdjustedPtr(IRB, *DL, BasePtr,
+ APInt(DL->getPointerSizeInBits(), PartOffset), PartPtrTy,
+ BasePtr->getName() + "."),
+ getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false,
+ LI->getName());
+
+ // Append this load onto the list of split loads so we can find it later
+ // to rewrite the stores.
+ SplitLoads.push_back(PLoad);
+
+ // Now build a new slice for the alloca.
+ NewSlices.push_back(Slice(BaseOffset + PartOffset,
+ BaseOffset + PartOffset + PartSize,
+ &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
+ /*IsSplittable*/ true));
+ DEBUG(AS.printSlice(dbgs(), std::prev(AS.end()), " "));
+ DEBUG(dbgs() << ": " << *PLoad << "\n");
+
+ // See if we've handled all the splits.
+ if (Idx >= Size)
+ break;
+
+ // Setup the next partition.
+ PartOffset = Offsets.Splits[Idx];
+ ++Idx;
+ PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset;
+ }
+
+ // Now that we have the split loads, do the slow walk over all uses of the
+ // load and rewrite them as split stores, or save the split loads to use
+ // below if the store is going to be split there anyway.
+ bool DeferredStores = false;
+ for (User *LU : LI->users()) {
+ StoreInst *SI = cast<StoreInst>(LU);
+ if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
+ DeferredStores = true;
+ DEBUG(dbgs() << " Deferred splitting of store: " << *SI << "\n");
+ continue;
+ }
+
+ Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
+ IRB.SetInsertPoint(BasicBlock::iterator(SI));
+
+ DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
+
+ for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
+ LoadInst *PLoad = SplitLoads[Idx];
+ uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
+ auto *PartPtrTy = PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
+
+ StoreInst *PStore = IRB.CreateAlignedStore(
+ PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr,
+ APInt(DL->getPointerSizeInBits(), PartOffset),
+ PartPtrTy, StoreBasePtr->getName() + "."),
+ getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false);
+ (void)PStore;
+ DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
+ }
+
+ // We want to immediately iterate on any allocas impacted by splitting
+ // this store, and we have to track any promotable alloca (indicated by
+ // a direct store) as needing to be resplit because it is no longer
+ // promotable.
+ if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
+ ResplitPromotableAllocas.insert(OtherAI);
+ Worklist.insert(OtherAI);
+ } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+ StoreBasePtr->stripInBoundsOffsets())) {
+ Worklist.insert(OtherAI);
+ }
+
+ // Mark the original store as dead.
+ DeadInsts.insert(SI);
+ }
+
+ // Save the split loads if there are deferred stores among the users.
+ if (DeferredStores)
+ SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
+
+ // Mark the original load as dead and kill the original slice.
+ DeadInsts.insert(LI);
+ Offsets.S->kill();
+ }
+
+ // Second, we rewrite all of the split stores. At this point, we know that
+ // all loads from this alloca have been split already. For stores of such
+ // loads, we can simply look up the pre-existing split loads. For stores of
+ // other loads, we split those loads first and then write split stores of
+ // them.
+ for (StoreInst *SI : Stores) {
+ auto *LI = cast<LoadInst>(SI->getValueOperand());
+ IntegerType *Ty = cast<IntegerType>(LI->getType());
+ uint64_t StoreSize = Ty->getBitWidth() / 8;
+ assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
+
+ auto &Offsets = SplitOffsetsMap[SI];
+ assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
+ "Slice size should always match load size exactly!");
+ uint64_t BaseOffset = Offsets.S->beginOffset();
+ assert(BaseOffset + StoreSize > BaseOffset &&
+ "Cannot represent alloca access size using 64-bit integers!");
+
+ Instruction *LoadBasePtr = cast<Instruction>(LI->getPointerOperand());
+ Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
+
+ DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
+
+ // Check whether we have an already split load.
+ auto SplitLoadsMapI = SplitLoadsMap.find(LI);
+ std::vector<LoadInst *> *SplitLoads = nullptr;
+ if (SplitLoadsMapI != SplitLoadsMap.end()) {
+ SplitLoads = &SplitLoadsMapI->second;
+ assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
+ "Too few split loads for the number of splits in the store!");
+ } else {
+ DEBUG(dbgs() << " of load: " << *LI << "\n");
+ }
+
+ uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
+ int Idx = 0, Size = Offsets.Splits.size();
+ for (;;) {
+ auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
+ auto *PartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
+
+ // Either lookup a split load or create one.
+ LoadInst *PLoad;
+ if (SplitLoads) {
+ PLoad = (*SplitLoads)[Idx];
+ } else {
+ IRB.SetInsertPoint(BasicBlock::iterator(LI));
+ PLoad = IRB.CreateAlignedLoad(
+ getAdjustedPtr(IRB, *DL, LoadBasePtr,
+ APInt(DL->getPointerSizeInBits(), PartOffset),
+ PartPtrTy, LoadBasePtr->getName() + "."),
+ getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false,
+ LI->getName());
+ }
+
+ // And store this partition.
+ IRB.SetInsertPoint(BasicBlock::iterator(SI));
+ StoreInst *PStore = IRB.CreateAlignedStore(
+ PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr,
+ APInt(DL->getPointerSizeInBits(), PartOffset),
+ PartPtrTy, StoreBasePtr->getName() + "."),
+ getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false);
+
+ // Now build a new slice for the alloca.
+ NewSlices.push_back(
+ Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
+ &PStore->getOperandUse(PStore->getPointerOperandIndex()),
+ /*IsSplittable*/ true));
+ DEBUG(AS.printSlice(dbgs(), std::prev(AS.end()), " "));
+ DEBUG(dbgs() << ": " << *PStore << "\n");
+ if (!SplitLoads) {
+ DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
+ }
+
+ // See if we've handled all the splits.
+ if (Idx >= Size)
+ break;
+
+ // Setup the next partition.
+ PartOffset = Offsets.Splits[Idx];
+ ++Idx;
+ PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
+ }
+
+ // We want to immediately iterate on any allocas impacted by splitting
+ // this load, which is only relevant if it isn't a load of this alloca and
+ // thus we didn't already split the loads above. We also have to keep track
+ // of any promotable allocas we split loads on as they can no longer be
+ // promoted.
+ if (!SplitLoads) {
+ if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
+ assert(OtherAI != &AI && "We can't re-split our own alloca!");
+ ResplitPromotableAllocas.insert(OtherAI);
+ Worklist.insert(OtherAI);
+ } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+ LoadBasePtr->stripInBoundsOffsets())) {
+ assert(OtherAI != &AI && "We can't re-split our own alloca!");
+ Worklist.insert(OtherAI);
+ }
+ }
+
+ // Mark the original store as dead now that we've split it up and kill its
+ // slice. Note that we leave the original load in place. It may in turn be
+ // split up if it is an alloca load for some other alloca, but it may be
+ // a normal load. This may introduce redundant loads, but where those can
+ // be merged the rest of the optimizer should handle the merging, and this
+ // uncovers SSA splits which is more important. In practice, the original
+ // loads will almost always be fully split and removed eventually, and the
+ // splits will be merged by any trivial CSE, including instcombine.
+ DeadInsts.insert(SI);
+ Offsets.S->kill();
+ }
+
+ // Now we need to remove the killed slices, sort the newly added slices, and
+ // merge the two sorted ranges of slices so that the entire range is sorted
+ // properly for us to re-compute the partitions.
+ AS.erase(std::remove_if(AS.begin(), AS.end(), [](const Slice &S) {
+ return S.isDead();
+ }), AS.end());
+
+ AS.insert(NewSlices);
+
+ DEBUG(dbgs() << " Pre-split slices:\n");
+#ifndef NDEBUG
+ for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
+ DEBUG(AS.print(dbgs(), I, " "));
+#endif
+
+ // Finally, don't try to promote any allocas that now require re-splitting.
+ // They have already been added to the worklist above.
+ PromotableAllocas.erase(
+ std::remove_if(
+ PromotableAllocas.begin(), PromotableAllocas.end(),
+ [&](AllocaInst *AI) { return ResplitPromotableAllocas.count(AI); }),
+ PromotableAllocas.end());
+
+ return true;
+}
+
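To make the bookkeeping in presplitLoadsAndStores() concrete, consider the 12-byte alloca from the function comment: it ends up partitioned at offsets 4 and 8, the i64 load covers [0,8), and the paired i64 store covers [4,12). Each records the partition boundary that falls strictly inside it relative to its own start, so both see the split list {4}, the pair is considered matching, and it is pre-split into two i32-sized accesses. A standalone sketch of that computation (relativeSplits is a hypothetical helper, not code from the patch):

#include <cassert>
#include <cstdint>
#include <vector>

// For an access covering [Begin, End), collect the partition end offsets that
// fall strictly inside it, expressed relative to Begin. Relative offsets are
// what lets a load and a store at different absolute positions be compared.
static std::vector<uint64_t>
relativeSplits(uint64_t Begin, uint64_t End,
               const std::vector<uint64_t> &PartitionEnds) {
  std::vector<uint64_t> Splits;
  for (uint64_t PE : PartitionEnds)
    if (PE > Begin && PE < End)
      Splits.push_back(PE - Begin);
  return Splits;
}

int main() {
  // Partitions of the 12-byte alloca: [0,4), [4,8), [8,12).
  const std::vector<uint64_t> PartitionEnds = {4, 8, 12};

  auto LoadSplits = relativeSplits(0, 8, PartitionEnds);   // load of [0,8)   -> {4}
  auto StoreSplits = relativeSplits(4, 12, PartitionEnds); // store of [4,12) -> {4}

  // Matching relative splits mean the load/store pair can be pre-split; a
  // mismatch would put the store on the reject list and blacklist the load.
  assert(LoadSplits == StoreSplits);
  return 0;
}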
/// \brief Rewrite an alloca partition's users.
///
/// This routine drives both of the rewriting goals of the SROA pass. It tries
@@ -3582,7 +4062,9 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
unsigned NumPartitions = 0;
bool Changed = false;
- // Rewrite each parttion.
+ Changed |= presplitLoadsAndStores(AI, AS);
+
+ // Rewrite each partition.
for (auto &P : AS.partitions()) {
Changed |= rewritePartition(AI, AS, P);
++NumPartitions;
diff --git a/llvm/test/Transforms/SROA/basictest.ll b/llvm/test/Transforms/SROA/basictest.ll
index dc2b16550a0..914095dc383 100644
--- a/llvm/test/Transforms/SROA/basictest.ll
+++ b/llvm/test/Transforms/SROA/basictest.ll
@@ -572,8 +572,7 @@ bad:
}
define i8 @test12() {
-; We fully promote these to the i24 load or store size, resulting in just masks
-; and other operations that instcombine will fold, but no alloca.
+; We promote these to three SSA values which fold away immediately.
;
; CHECK-LABEL: @test12(
@@ -592,17 +591,6 @@ entry:
%ai = load i24* %aiptr
; CHECK-NOT: store
; CHECK-NOT: load
-; CHECK: %[[ext2:.*]] = zext i8 0 to i24
-; CHECK-NEXT: %[[shift2:.*]] = shl i24 %[[ext2]], 16
-; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, 65535
-; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[shift2]]
-; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24
-; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8
-; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281
-; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]]
-; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24
-; CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], -256
-; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[ext0]]
%biptr = bitcast [3 x i8]* %b to i24*
store i24 %ai, i24* %biptr
@@ -614,17 +602,12 @@ entry:
%b2 = load i8* %b2ptr
; CHECK-NOT: store
; CHECK-NOT: load
-; CHECK: %[[trunc0:.*]] = trunc i24 %[[insert0]] to i8
-; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8
-; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8
-; CHECK-NEXT: %[[shift2:.*]] = lshr i24 %[[insert0]], 16
-; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[shift2]] to i8
%bsum0 = add i8 %b0, %b1
%bsum1 = add i8 %bsum0, %b2
ret i8 %bsum1
-; CHECK: %[[sum0:.*]] = add i8 %[[trunc0]], %[[trunc1]]
-; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], %[[trunc2]]
+; CHECK: %[[sum0:.*]] = add i8 0, 0
+; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], 0
; CHECK-NEXT: ret i8 %[[sum1]]
}
@@ -1440,3 +1423,36 @@ entry:
ret void
}
+define float @test25() {
+; Check that we split up stores in order to promote the smaller SSA values. These
+; kinds of patterns can arise because LLVM maps small memcpys to integer loads and
+; stores. If we get a memcpy of an aggregate (such as C and C++ frontends would
+; produce, but so might any language frontend), this will in many cases turn into
+; an integer load and store. SROA needs to be extremely powerful to correctly
+; handle these cases and form splittable and promotable SSA values.
+;
+; CHECK-LABEL: @test25(
+; CHECK-NOT: alloca
+; CHECK: %[[F1:.*]] = bitcast i32 0 to float
+; CHECK: %[[F2:.*]] = bitcast i32 1065353216 to float
+; CHECK: %[[SUM:.*]] = fadd float %[[F1]], %[[F2]]
+; CHECK: ret float %[[SUM]]
+
+entry:
+ %a = alloca i64
+ %b = alloca i64
+ %a.cast = bitcast i64* %a to [2 x float]*
+ %a.gep1 = getelementptr [2 x float]* %a.cast, i32 0, i32 0
+ %a.gep2 = getelementptr [2 x float]* %a.cast, i32 0, i32 1
+ %b.cast = bitcast i64* %b to [2 x float]*
+ %b.gep1 = getelementptr [2 x float]* %b.cast, i32 0, i32 0
+ %b.gep2 = getelementptr [2 x float]* %b.cast, i32 0, i32 1
+ store float 0.0, float* %a.gep1
+ store float 1.0, float* %a.gep2
+ %v = load i64* %a
+ store i64 %v, i64* %b
+ %f1 = load float* %b.gep1
+ %f2 = load float* %b.gep2
+ %ret = fadd float %f1, %f2
+ ret float %ret
+}
diff --git a/llvm/test/Transforms/SROA/big-endian.ll b/llvm/test/Transforms/SROA/big-endian.ll
index 9e87a9f073c..1ef074844d1 100644
--- a/llvm/test/Transforms/SROA/big-endian.ll
+++ b/llvm/test/Transforms/SROA/big-endian.ll
@@ -3,65 +3,6 @@
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
-define i8 @test1() {
-; We fully promote these to the i24 load or store size, resulting in just masks
-; and other operations that instcombine will fold, but no alloca. Note this is
-; the same as test12 in basictest.ll, but here we assert big-endian byte
-; ordering.
-;
-; CHECK-LABEL: @test1(
-
-entry:
- %a = alloca [3 x i8]
- %b = alloca [3 x i8]
-; CHECK-NOT: alloca
-
- %a0ptr = getelementptr [3 x i8]* %a, i64 0, i32 0
- store i8 0, i8* %a0ptr
- %a1ptr = getelementptr [3 x i8]* %a, i64 0, i32 1
- store i8 0, i8* %a1ptr
- %a2ptr = getelementptr [3 x i8]* %a, i64 0, i32 2
- store i8 0, i8* %a2ptr
- %aiptr = bitcast [3 x i8]* %a to i24*
- %ai = load i24* %aiptr
-; CHECK-NOT: store
-; CHECK-NOT: load
-; CHECK: %[[ext2:.*]] = zext i8 0 to i24
-; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, -256
-; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[ext2]]
-; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24
-; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8
-; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281
-; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]]
-; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24
-; CHECK-NEXT: %[[shift0:.*]] = shl i24 %[[ext0]], 16
-; CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], 65535
-; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[shift0]]
-
- %biptr = bitcast [3 x i8]* %b to i24*
- store i24 %ai, i24* %biptr
- %b0ptr = getelementptr [3 x i8]* %b, i64 0, i32 0
- %b0 = load i8* %b0ptr
- %b1ptr = getelementptr [3 x i8]* %b, i64 0, i32 1
- %b1 = load i8* %b1ptr
- %b2ptr = getelementptr [3 x i8]* %b, i64 0, i32 2
- %b2 = load i8* %b2ptr
-; CHECK-NOT: store
-; CHECK-NOT: load
-; CHECK: %[[shift0:.*]] = lshr i24 %[[insert0]], 16
-; CHECK-NEXT: %[[trunc0:.*]] = trunc i24 %[[shift0]] to i8
-; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8
-; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8
-; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[insert0]] to i8
-
- %bsum0 = add i8 %b0, %b1
- %bsum1 = add i8 %bsum0, %b2
- ret i8 %bsum1
-; CHECK: %[[sum0:.*]] = add i8 %[[trunc0]], %[[trunc1]]
-; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], %[[trunc2]]
-; CHECK-NEXT: ret i8 %[[sum1]]
-}
-
define i64 @test2() {
; Test for various mixed sizes of integer loads and stores all getting
; promoted.