-rw-r--r--  llvm/lib/Transforms/Scalar/SROA.cpp     | 486
-rw-r--r--  llvm/test/Transforms/SROA/basictest.ll  |  56
-rw-r--r--  llvm/test/Transforms/SROA/big-endian.ll |  59
3 files changed, 520 insertions(+), 81 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 688755f0704..48f38f7f473 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -237,6 +237,24 @@ public:
const_iterator end() const { return Slices.end(); }
/// @}
+ /// \brief Erase a range of slices.
+ void erase(iterator Start, iterator Stop) {
+ Slices.erase(Start, Stop);
+ }
+
+ /// \brief Insert new slices for this alloca.
+ ///
+ /// This moves the slices into the alloca's slices collection, and re-sorts
+ /// everything so that the usual ordering properties of the alloca's slices
+ /// hold.
+ void insert(ArrayRef<Slice> NewSlices) {
+ int OldSize = Slices.size();
+ Slices.append(NewSlices.begin(), NewSlices.end());
+ auto SliceI = Slices.begin() + OldSize;
+ std::sort(SliceI, Slices.end());
+ std::inplace_merge(Slices.begin(), SliceI, Slices.end());
+ }
+
// Forward declare an iterator to befriend it.
class partition_iterator;
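The insert() helper above keeps the slice list sorted by appending the new slices, sorting only the appended tail, and merging the two sorted runs with std::inplace_merge. A minimal standalone sketch of that idiom, using plain integers instead of Slice objects (all names here are illustrative, not code from the patch):

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  // An already-sorted collection, as AllocaSlices maintains for its slices.
  std::vector<int> Values = {1, 4, 9};

  // Append the (possibly unsorted) new elements, remembering where the old
  // sorted run ends.
  std::vector<int> NewValues = {7, 2};
  auto OldSize = Values.size();
  Values.insert(Values.end(), NewValues.begin(), NewValues.end());

  // Sort just the appended tail, then merge the two sorted runs in place.
  auto Mid = Values.begin() + OldSize;
  std::sort(Mid, Values.end());
  std::inplace_merge(Values.begin(), Mid, Values.end());

  assert(std::is_sorted(Values.begin(), Values.end()));
  return 0;
}

Sorting only the new tail and merging is cheaper than re-sorting the whole container, which is exactly the situation insert() is in since the existing slices are already ordered.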
@@ -1022,6 +1040,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
void AllocaSlices::print(raw_ostream &OS, const_iterator I,
StringRef Indent) const {
printSlice(OS, I, Indent);
+ OS << "\n";
printUse(OS, I, Indent);
}
@@ -1029,7 +1048,7 @@ void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
StringRef Indent) const {
OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
<< " slice #" << (I - begin())
- << (I->isSplittable() ? " (splittable)" : "") << "\n";
+ << (I->isSplittable() ? " (splittable)" : "");
}
void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
@@ -1245,6 +1264,7 @@ private:
friend class PHIOrSelectSpeculator;
friend class AllocaSliceRewriter;
+ bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
bool rewritePartition(AllocaInst &AI, AllocaSlices &AS,
AllocaSlices::Partition &P);
bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
@@ -1794,6 +1814,27 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
return Ptr;
}
+/// \brief Compute the adjusted alignment for a load or store from an offset.
+static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset,
+ const DataLayout &DL) {
+ unsigned Alignment;
+ Type *Ty;
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ Alignment = LI->getAlignment();
+ Ty = LI->getType();
+ } else if (auto *SI = dyn_cast<StoreInst>(I)) {
+ Alignment = SI->getAlignment();
+ Ty = SI->getValueOperand()->getType();
+ } else {
+ llvm_unreachable("Only loads and stores are allowed!");
+ }
+
+ if (!Alignment)
+ Alignment = DL.getABITypeAlignment(Ty);
+
+ return MinAlign(Alignment, Offset);
+}
+
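getAdjustedAlignment() above conservatively clamps the known alignment by the offset being accessed: an access Offset bytes past a pointer that is only Alignment-aligned can only be assumed aligned to the largest power of two dividing both values, which is what llvm::MinAlign computes. A small self-contained sketch of that arithmetic (MinAlignSketch is a local stand-in for illustration, not the LLVM function):

#include <cassert>
#include <cstdint>

// Largest power of two dividing both A and B, i.e. the lowest set bit of A|B.
static uint64_t MinAlignSketch(uint64_t A, uint64_t B) {
  return (A | B) & (1 + ~(A | B));
}

int main() {
  // A field 4 bytes into an 8-byte-aligned object is only 4-byte aligned.
  assert(MinAlignSketch(8, 4) == 4);
  // Offsets that are multiples of the alignment preserve it.
  assert(MinAlignSketch(8, 16) == 8);
  // Offset 0 keeps whatever alignment the base pointer had.
  assert(MinAlignSketch(16, 0) == 16);
  return 0;
}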
/// \brief Test whether we can convert a value from the old to the new type.
///
/// This predicate should be used to guard calls to convertValue in order to
@@ -2412,6 +2453,7 @@ public:
BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
DEBUG(AS.printSlice(dbgs(), I, ""));
+ DEBUG(dbgs() << "\n");
// Compute the intersecting offset range.
assert(BeginOffset < NewAllocaEndOffset);
@@ -3426,6 +3468,444 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
return SubTy;
}
+/// \brief Pre-split loads and stores to simplify rewriting.
+///
+/// We want to break up the splittable load+store pairs as much as
+/// possible. This is important to do as a preprocessing step, as once we
+/// start rewriting the accesses to partitions of the alloca we lose the
+/// necessary information to correctly split apart paired loads and stores
+/// which both point into this alloca. The case to consider is something like
+/// the following:
+///
+/// %a = alloca [12 x i8]
+/// %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
+/// %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
+/// %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
+/// %iptr1 = bitcast i8* %gep1 to i64*
+/// %iptr2 = bitcast i8* %gep2 to i64*
+/// %fptr1 = bitcast i8* %gep1 to float*
+/// %fptr2 = bitcast i8* %gep2 to float*
+/// %fptr3 = bitcast i8* %gep3 to float*
+/// store float 0.0, float* %fptr1
+/// store float 1.0, float* %fptr2
+/// %v = load i64* %iptr1
+/// store i64 %v, i64* %iptr2
+/// %f1 = load float* %fptr2
+/// %f2 = load float* %fptr3
+///
+/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
+/// promote everything so we recover the 2 SSA values that should have been
+/// there all along.
+///
+/// \returns true if any changes are made.
+bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
+ DEBUG(dbgs() << "Pre-splitting loads and stores\n");
+
+ // Track the loads and stores which are candidates for pre-splitting here, in
+ // the order they first appear during the partition scan. These give stable
+ // iteration order and a basis for tracking which loads and stores we
+ // actually split.
+ SmallVector<LoadInst *, 4> Loads;
+ SmallVector<StoreInst *, 4> Stores;
+
+ // We need to accumulate the splits required of each load or store where we
+ // can find them via a direct lookup. This is important to cross-check loads
+ // and stores against each other. We also track the slice so that we can kill
+ // all the slices that end up split.
+ struct SplitOffsets {
+ Slice *S;
+ std::vector<uint64_t> Splits;
+ };
+ SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
+
+ DEBUG(dbgs() << " Searching for candidate loads and stores\n");
+ for (auto &P : AS.partitions()) {
+ for (Slice &S : P) {
+ if (!S.isSplittable())
+ continue;
+ if (S.endOffset() <= P.endOffset())
+ continue;
+ assert(P.endOffset() > S.beginOffset() &&
+ "Empty or backwards partition!");
+
+ // Determine if this is a pre-splittable slice.
+ Instruction *I = cast<Instruction>(S.getUse()->getUser());
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ assert(!LI->isVolatile() && "Cannot split volatile loads!");
+
+ // The load must be used exclusively to store into other pointers for
+ // us to be able to arbitrarily pre-split it. The stores must also be
+ // simple to avoid changing semantics.
+ auto IsLoadSimplyStored = [](LoadInst *LI) {
+ for (User *LU : LI->users()) {
+ auto *SI = dyn_cast<StoreInst>(LU);
+ if (!SI || !SI->isSimple())
+ return false;
+ }
+ return true;
+ };
+ if (!IsLoadSimplyStored(LI))
+ continue;
+
+ Loads.push_back(LI);
+ } else if (auto *SI = dyn_cast<StoreInst>(I)) {
+ if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
+ continue;
+ auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
+ if (!StoredLoad || !StoredLoad->isSimple())
+ continue;
+ assert(!SI->isVolatile() && "Cannot split volatile stores!");
+
+ Stores.push_back(SI);
+ } else {
+ // Other uses cannot be pre-split.
+ continue;
+ }
+
+ // Record the initial split.
+ DEBUG(dbgs() << " Candidate: " << *I << "\n");
+ auto &Offsets = SplitOffsetsMap[I];
+ assert(Offsets.Splits.empty() &&
+ "Should not have splits the first time we see an instruction!");
+ Offsets.S = &S;
+ Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
+ }
+
+ // Now scan the already split slices, and add a split for any of them which
+ // we're going to pre-split.
+ for (Slice *S : P.splitSliceTails()) {
+ auto SplitOffsetsMapI =
+ SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
+ if (SplitOffsetsMapI == SplitOffsetsMap.end())
+ continue;
+ auto &Offsets = SplitOffsetsMapI->second;
+
+ assert(Offsets.S == S && "Found a mismatched slice!");
+ assert(!Offsets.Splits.empty() &&
+ "Cannot have an empty set of splits on the second partition!");
+ assert(Offsets.Splits.back() == P.beginOffset() - Offsets.S->beginOffset() &&
+ "Previous split does not end where this one begins!");
+
+ // Record each split. The last partition's end isn't needed as the size
+ // of the slice dictates that.
+ if (S->endOffset() > P.endOffset())
+ Offsets.Splits.push_back(P.endOffset() - S->beginOffset());
+ }
+ }
+
+ // We may have split loads where some of their stores are split stores. For
+ // such loads and stores, we can only pre-split them if their splits exactly
+ // match relative to their starting offset. We have to verify this prior to
+ // any rewriting.
+ SmallPtrSet<LoadInst *, 4> BadSplitLoads;
+ Stores.erase(
+ std::remove_if(
+ Stores.begin(), Stores.end(),
+ [&BadSplitLoads, &SplitOffsetsMap](
+ StoreInst *SI) {
+ // Lookup the load we are storing in our map of split offsets.
+ auto *LI = cast<LoadInst>(SI->getValueOperand());
+ auto LoadOffsetsI = SplitOffsetsMap.find(LI);
+ if (LoadOffsetsI == SplitOffsetsMap.end())
+ return false; // Unrelated loads are always safe.
+ auto &LoadOffsets = LoadOffsetsI->second;
+
+ // Now lookup the store's offsets.
+ auto &StoreOffsets = SplitOffsetsMap[SI];
+
+ // If the relative offsets of each split in the load and store
+ // match exactly, then we can split them and we don't need to
+ // remove them here.
+ if (LoadOffsets.Splits == StoreOffsets.Splits)
+ return false;
+
+ DEBUG(dbgs() << " Mismatched splits for load and store:\n"
+ << " " << *LI << "\n"
+ << " " << *SI << "\n");
+
+ // We've found a store and load that we need to split with
+ // mismatched relative splits. Just give up on them and remove both
+ // instructions from our list of candidates.
+ BadSplitLoads.insert(LI);
+ return true;
+ }),
+ Stores.end());
+ Loads.erase(std::remove_if(Loads.begin(), Loads.end(),
+ [&BadSplitLoads](LoadInst *LI) {
+ return BadSplitLoads.count(LI);
+ }),
+ Loads.end());
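The two erase/remove_if calls above implement a reject-and-propagate filter: a store whose split points disagree with those of the load it copies is removed, and the offending load is recorded so it can be removed from its own candidate list in the second pass. The same shape in a tiny self-contained sketch (the Pair records and values are made up for illustration):

#include <algorithm>
#include <cassert>
#include <set>
#include <vector>

int main() {
  // Each candidate "store" carries the id of the load it copies plus the
  // relative split offsets computed for each side of the pair.
  struct Pair {
    int LoadId;
    std::vector<int> LoadSplits, StoreSplits;
  };
  std::vector<Pair> Stores = {{0, {4}, {4}}, {1, {4}, {8}}};
  std::set<int> BadLoads;

  // Drop any store whose splits disagree with its load's splits, remembering
  // the load so the caller can drop it from the load candidate list too.
  Stores.erase(std::remove_if(Stores.begin(), Stores.end(),
                              [&BadLoads](const Pair &P) {
                                if (P.LoadSplits == P.StoreSplits)
                                  return false;
                                BadLoads.insert(P.LoadId);
                                return true;
                              }),
               Stores.end());

  assert(Stores.size() == 1 && BadLoads.count(1) == 1);
  return 0;
}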
+
+ // If no loads or stores are left, there is no pre-splitting to be done for
+ // this alloca.
+ if (Loads.empty() && Stores.empty())
+ return false;
+
+ // From here on, we can't fail and will be building new accesses, so rig up
+ // an IR builder.
+ IRBuilderTy IRB(&AI);
+
+ // Collect the new slices which we will merge into the alloca slices.
+ SmallVector<Slice, 4> NewSlices;
+
+ // Track any allocas we end up splitting loads and stores for so we can
+ // iterate on them.
+ SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
+
+ // At this point, we have collected all of the loads and stores we can
+ // pre-split, and the specific splits needed for them. We actually do the
+ // splitting in a specific order to handle the case where one of the loads is
+ // the value operand of one of the stores.
+ //
+ // First, we rewrite all of the split loads, and just accumulate each split
+ // load in a parallel structure. We also build the slices for them and append
+ // them to the alloca slices.
+ SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
+ std::vector<LoadInst *> SplitLoads;
+ for (LoadInst *LI : Loads) {
+ SplitLoads.clear();
+
+ IntegerType *Ty = cast<IntegerType>(LI->getType());
+ uint64_t LoadSize = Ty->getBitWidth() / 8;
+ assert(LoadSize > 0 && "Cannot have a zero-sized integer load!");
+
+ auto &Offsets = SplitOffsetsMap[LI];
+ assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
+ "Slice size should always match load size exactly!");
+ uint64_t BaseOffset = Offsets.S->beginOffset();
+ assert(BaseOffset + LoadSize > BaseOffset &&
+ "Cannot represent alloca access size using 64-bit integers!");
+
+ Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
+ IRB.SetInsertPoint(BasicBlock::iterator(LI));
+
+ DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
+
+ uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
+ int Idx = 0, Size = Offsets.Splits.size();
+ for (;;) {
+ auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
+ auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
+ LoadInst *PLoad = IRB.CreateAlignedLoad(
+ getAdjustedPtr(IRB, *DL, BasePtr,
+ APInt(DL->getPointerSizeInBits(), PartOffset), PartPtrTy,
+ BasePtr->getName() + "."),
+ getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false,
+ LI->getName());
+
+ // Append this load onto the list of split loads so we can find it later
+ // to rewrite the stores.
+ SplitLoads.push_back(PLoad);
+
+ // Now build a new slice for the alloca.
+ NewSlices.push_back(Slice(BaseOffset + PartOffset,
+ BaseOffset + PartOffset + PartSize,
+ &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
+ /*IsSplittable*/ true));
+ DEBUG(AS.printSlice(dbgs(), std::prev(AS.end()), " "));
+ DEBUG(dbgs() << ": " << *PLoad << "\n");
+
+ // See if we've handled all the splits.
+ if (Idx >= Size)
+ break;
+
+ // Setup the next partition.
+ PartOffset = Offsets.Splits[Idx];
+ ++Idx;
+ PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset;
+ }
+
+ // Now that we have the split loads, do the slow walk over all uses of the
+ // load and rewrite them as split stores, or save the split loads to use
+ // below if the store is going to be split there anyway.
+ bool DeferredStores = false;
+ for (User *LU : LI->users()) {
+ StoreInst *SI = cast<StoreInst>(LU);
+ if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
+ DeferredStores = true;
+ DEBUG(dbgs() << " Deferred splitting of store: " << *SI << "\n");
+ continue;
+ }
+
+ Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
+ IRB.SetInsertPoint(BasicBlock::iterator(SI));
+
+ DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
+
+ for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
+ LoadInst *PLoad = SplitLoads[Idx];
+ uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
+ auto *PartPtrTy = PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
+
+ StoreInst *PStore = IRB.CreateAlignedStore(
+ PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr,
+ APInt(DL->getPointerSizeInBits(), PartOffset),
+ PartPtrTy, StoreBasePtr->getName() + "."),
+ getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false);
+ (void)PStore;
+ DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
+ }
+
+ // We want to immediately iterate on any allocas impacted by splitting
+ // this store, and we have to track any promotable alloca (indicated by
+ // a direct store) as needing to be resplit because it is no longer
+ // promotable.
+ if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
+ ResplitPromotableAllocas.insert(OtherAI);
+ Worklist.insert(OtherAI);
+ } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+ StoreBasePtr->stripInBoundsOffsets())) {
+ Worklist.insert(OtherAI);
+ }
+
+ // Mark the original store as dead.
+ DeadInsts.insert(SI);
+ }
+
+ // Save the split loads if there are deferred stores among the users.
+ if (DeferredStores)
+ SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
+
+ // Mark the original load as dead and kill the original slice.
+ DeadInsts.insert(LI);
+ Offsets.S->kill();
+ }
+
+ // Second, we rewrite all of the split stores. At this point, we know that
+ // all loads from this alloca have been split already. For stores of such
+ // loads, we can simply look up the pre-existing split loads. For stores of
+ // other loads, we split those loads first and then write split stores of
+ // them.
+ for (StoreInst *SI : Stores) {
+ auto *LI = cast<LoadInst>(SI->getValueOperand());
+ IntegerType *Ty = cast<IntegerType>(LI->getType());
+ uint64_t StoreSize = Ty->getBitWidth() / 8;
+ assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
+
+ auto &Offsets = SplitOffsetsMap[SI];
+ assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
+ "Slice size should always match load size exactly!");
+ uint64_t BaseOffset = Offsets.S->beginOffset();
+ assert(BaseOffset + StoreSize > BaseOffset &&
+ "Cannot represent alloca access size using 64-bit integers!");
+
+ Instruction *LoadBasePtr = cast<Instruction>(LI->getPointerOperand());
+ Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
+
+ DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
+
+ // Check whether we have an already split load.
+ auto SplitLoadsMapI = SplitLoadsMap.find(LI);
+ std::vector<LoadInst *> *SplitLoads = nullptr;
+ if (SplitLoadsMapI != SplitLoadsMap.end()) {
+ SplitLoads = &SplitLoadsMapI->second;
+ assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
+ "Too few split loads for the number of splits in the store!");
+ } else {
+ DEBUG(dbgs() << " of load: " << *LI << "\n");
+ }
+
+ uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
+ int Idx = 0, Size = Offsets.Splits.size();
+ for (;;) {
+ auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
+ auto *PartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
+
+ // Either lookup a split load or create one.
+ LoadInst *PLoad;
+ if (SplitLoads) {
+ PLoad = (*SplitLoads)[Idx];
+ } else {
+ IRB.SetInsertPoint(BasicBlock::iterator(LI));
+ PLoad = IRB.CreateAlignedLoad(
+ getAdjustedPtr(IRB, *DL, LoadBasePtr,
+ APInt(DL->getPointerSizeInBits(), PartOffset),
+ PartPtrTy, LoadBasePtr->getName() + "."),
+ getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false,
+ LI->getName());
+ }
+
+ // And store this partition.
+ IRB.SetInsertPoint(BasicBlock::iterator(SI));
+ StoreInst *PStore = IRB.CreateAlignedStore(
+ PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr,
+ APInt(DL->getPointerSizeInBits(), PartOffset),
+ PartPtrTy, StoreBasePtr->getName() + "."),
+ getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false);
+
+ // Now build a new slice for the alloca.
+ NewSlices.push_back(
+ Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
+ &PStore->getOperandUse(PStore->getPointerOperandIndex()),
+ /*IsSplittable*/ true));
+ DEBUG(AS.printSlice(dbgs(), std::prev(AS.end()), " "));
+ DEBUG(dbgs() << ": " << *PStore << "\n");
+ if (!SplitLoads) {
+ DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
+ }
+
+ // See if we've handled all the splits.
+ if (Idx >= Size)
+ break;
+
+ // Setup the next partition.
+ PartOffset = Offsets.Splits[Idx];
+ ++Idx;
+ PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
+ }
+
+ // We want to immediately iterate on any allocas impacted by splitting
+ // this load, which is only relevant if it isn't a load of this alloca and
+ // thus we didn't already split the loads above. We also have to keep track
+ // of any promotable allocas we split loads on as they can no longer be
+ // promoted.
+ if (!SplitLoads) {
+ if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
+ assert(OtherAI != &AI && "We can't re-split our own alloca!");
+ ResplitPromotableAllocas.insert(OtherAI);
+ Worklist.insert(OtherAI);
+ } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+ LoadBasePtr->stripInBoundsOffsets())) {
+ assert(OtherAI != &AI && "We can't re-split our own alloca!");
+ Worklist.insert(OtherAI);
+ }
+ }
+
+ // Mark the original store as dead now that we've split it up and kill its
+ // slice. Note that we leave the original load in place. It may in turn be
+ // split up if it is an alloca load for some other alloca, but it may be
+ // a normal load. This may introduce redundant loads, but where those can
+ // be merged the rest of the optimizer should handle the merging, and this
+ // uncovers SSA splits which is more important. In practice, the original
+ // loads will almost always be fully split and removed eventually, and the
+ // splits will be merged by any trivial CSE, including instcombine.
+ DeadInsts.insert(SI);
+ Offsets.S->kill();
+ }
+
+ // Now we need to remove the killed slices, sort the newly added slices, and
+ // merge the two sorted ranges of slices so that the entire range is sorted
+ // properly for us to re-compute the partitions.
+ AS.erase(std::remove_if(AS.begin(), AS.end(), [](const Slice &S) {
+ return S.isDead();
+ }), AS.end());
+
+ AS.insert(NewSlices);
+
+ DEBUG(dbgs() << " Pre-split slices:\n");
+#ifndef NDEBUG
+ for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
+ DEBUG(AS.print(dbgs(), I, " "));
+#endif
+
+ // Finally, don't try to promote any allocas that now require re-splitting.
+ // They have already been added to the worklist above.
+ PromotableAllocas.erase(
+ std::remove_if(
+ PromotableAllocas.begin(), PromotableAllocas.end(),
+ [&](AllocaInst *AI) { return ResplitPromotableAllocas.count(AI); }),
+ PromotableAllocas.end());
+
+ return true;
+}
+
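To make the bookkeeping in presplitLoadsAndStores() concrete, consider the 12-byte alloca from the function comment: it ends up partitioned at offsets 4 and 8, the i64 load covers [0,8), and the paired i64 store covers [4,12). Each records the partition boundary that falls strictly inside it relative to its own start, so both see the split list {4}, the pair is considered matching, and it is pre-split into two i32-sized accesses. A standalone sketch of that computation (relativeSplits is a hypothetical helper, not code from the patch):

#include <cassert>
#include <cstdint>
#include <vector>

// For an access covering [Begin, End), collect the partition end offsets that
// fall strictly inside it, expressed relative to Begin. Relative offsets are
// what lets a load and a store at different absolute positions be compared.
static std::vector<uint64_t>
relativeSplits(uint64_t Begin, uint64_t End,
               const std::vector<uint64_t> &PartitionEnds) {
  std::vector<uint64_t> Splits;
  for (uint64_t PE : PartitionEnds)
    if (PE > Begin && PE < End)
      Splits.push_back(PE - Begin);
  return Splits;
}

int main() {
  // Partitions of the 12-byte alloca: [0,4), [4,8), [8,12).
  const std::vector<uint64_t> PartitionEnds = {4, 8, 12};

  auto LoadSplits = relativeSplits(0, 8, PartitionEnds);   // load of [0,8)   -> {4}
  auto StoreSplits = relativeSplits(4, 12, PartitionEnds); // store of [4,12) -> {4}

  // Matching relative splits mean the load/store pair can be pre-split; a
  // mismatch would put the store on the reject list and blacklist the load.
  assert(LoadSplits == StoreSplits);
  return 0;
}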
/// \brief Rewrite an alloca partition's users.
///
/// This routine drives both of the rewriting goals of the SROA pass. It tries
@@ -3582,7 +4062,9 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
unsigned NumPartitions = 0;
bool Changed = false;
- // Rewrite each parttion.
+ Changed |= presplitLoadsAndStores(AI, AS);
+
+ // Rewrite each partition.
for (auto &P : AS.partitions()) {
Changed |= rewritePartition(AI, AS, P);
++NumPartitions;
diff --git a/llvm/test/Transforms/SROA/basictest.ll b/llvm/test/Transforms/SROA/basictest.ll
index dc2b16550a0..914095dc383 100644
--- a/llvm/test/Transforms/SROA/basictest.ll
+++ b/llvm/test/Transforms/SROA/basictest.ll
@@ -572,8 +572,7 @@ bad:
}
define i8 @test12() {
-; We fully promote these to the i24 load or store size, resulting in just masks
-; and other operations that instcombine will fold, but no alloca.
+; We promote these to three SSA values which fold away immediately.
;
; CHECK-LABEL: @test12(
@@ -592,17 +591,6 @@ entry:
%ai = load i24* %aiptr
; CHECK-NOT: store
; CHECK-NOT: load
-; CHECK: %[[ext2:.*]] = zext i8 0 to i24
-; CHECK-NEXT: %[[shift2:.*]] = shl i24 %[[ext2]], 16
-; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, 65535
-; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[shift2]]
-; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24
-; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8
-; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281
-; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]]
-; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24
-; CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], -256
-; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[ext0]]
%biptr = bitcast [3 x i8]* %b to i24*
store i24 %ai, i24* %biptr
@@ -614,17 +602,12 @@ entry:
%b2 = load i8* %b2ptr
; CHECK-NOT: store
; CHECK-NOT: load
-; CHECK: %[[trunc0:.*]] = trunc i24 %[[insert0]] to i8
-; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8
-; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8
-; CHECK-NEXT: %[[shift2:.*]] = lshr i24 %[[insert0]], 16
-; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[shift2]] to i8
%bsum0 = add i8 %b0, %b1
%bsum1 = add i8 %bsum0, %b2
ret i8 %bsum1
-; CHECK: %[[sum0:.*]] = add i8 %[[trunc0]], %[[trunc1]]
-; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], %[[trunc2]]
+; CHECK: %[[sum0:.*]] = add i8 0, 0
+; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], 0
; CHECK-NEXT: ret i8 %[[sum1]]
}
@@ -1440,3 +1423,36 @@ entry:
ret void
}
+define float @test25() {
+; Check that we split up stores in order to promote the smaller SSA values. These
+; kinds of patterns can arise because LLVM maps small memcpys to integer loads and
+; stores. If we get a memcpy of an aggregate (such as C and C++ frontends would
+; produce, but so might any language frontend), this will in many cases turn into
+; an integer load and store. SROA needs to be extremely powerful to correctly
+; handle these cases and form splittable and promotable SSA values.
+;
+; CHECK-LABEL: @test25(
+; CHECK-NOT: alloca
+; CHECK: %[[F1:.*]] = bitcast i32 0 to float
+; CHECK: %[[F2:.*]] = bitcast i32 1065353216 to float
+; CHECK: %[[SUM:.*]] = fadd float %[[F1]], %[[F2]]
+; CHECK: ret float %[[SUM]]
+
+entry:
+ %a = alloca i64
+ %b = alloca i64
+ %a.cast = bitcast i64* %a to [2 x float]*
+ %a.gep1 = getelementptr [2 x float]* %a.cast, i32 0, i32 0
+ %a.gep2 = getelementptr [2 x float]* %a.cast, i32 0, i32 1
+ %b.cast = bitcast i64* %b to [2 x float]*
+ %b.gep1 = getelementptr [2 x float]* %b.cast, i32 0, i32 0
+ %b.gep2 = getelementptr [2 x float]* %b.cast, i32 0, i32 1
+ store float 0.0, float* %a.gep1
+ store float 1.0, float* %a.gep2
+ %v = load i64* %a
+ store i64 %v, i64* %b
+ %f1 = load float* %b.gep1
+ %f2 = load float* %b.gep2
+ %ret = fadd float %f1, %f2
+ ret float %ret
+}
diff --git a/llvm/test/Transforms/SROA/big-endian.ll b/llvm/test/Transforms/SROA/big-endian.ll
index 9e87a9f073c..1ef074844d1 100644
--- a/llvm/test/Transforms/SROA/big-endian.ll
+++ b/llvm/test/Transforms/SROA/big-endian.ll
@@ -3,65 +3,6 @@
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
-define i8 @test1() {
-; We fully promote these to the i24 load or store size, resulting in just masks
-; and other operations that instcombine will fold, but no alloca. Note this is
-; the same as test12 in basictest.ll, but here we assert big-endian byte
-; ordering.
-;
-; CHECK-LABEL: @test1(
-
-entry:
- %a = alloca [3 x i8]
- %b = alloca [3 x i8]
-; CHECK-NOT: alloca
-
- %a0ptr = getelementptr [3 x i8]* %a, i64 0, i32 0
- store i8 0, i8* %a0ptr
- %a1ptr = getelementptr [3 x i8]* %a, i64 0, i32 1
- store i8 0, i8* %a1ptr
- %a2ptr = getelementptr [3 x i8]* %a, i64 0, i32 2
- store i8 0, i8* %a2ptr
- %aiptr = bitcast [3 x i8]* %a to i24*
- %ai = load i24* %aiptr
-; CHECK-NOT: store
-; CHECK-NOT: load
-; CHECK: %[[ext2:.*]] = zext i8 0 to i24
-; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, -256
-; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[ext2]]
-; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24
-; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8
-; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281
-; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]]
-; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24
-; CHECK-NEXT: %[[shift0:.*]] = shl i24 %[[ext0]], 16
-; CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], 65535
-; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[shift0]]
-
- %biptr = bitcast [3 x i8]* %b to i24*
- store i24 %ai, i24* %biptr
- %b0ptr = getelementptr [3 x i8]* %b, i64 0, i32 0
- %b0 = load i8* %b0ptr
- %b1ptr = getelementptr [3 x i8]* %b, i64 0, i32 1
- %b1 = load i8* %b1ptr
- %b2ptr = getelementptr [3 x i8]* %b, i64 0, i32 2
- %b2 = load i8* %b2ptr
-; CHECK-NOT: store
-; CHECK-NOT: load
-; CHECK: %[[shift0:.*]] = lshr i24 %[[insert0]], 16
-; CHECK-NEXT: %[[trunc0:.*]] = trunc i24 %[[shift0]] to i8
-; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8
-; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8
-; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[insert0]] to i8
-
- %bsum0 = add i8 %b0, %b1
- %bsum1 = add i8 %bsum0, %b2
- ret i8 %bsum1
-; CHECK: %[[sum0:.*]] = add i8 %[[trunc0]], %[[trunc1]]
-; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], %[[trunc2]]
-; CHECK-NEXT: ret i8 %[[sum1]]
-}
-
define i64 @test2() {
; Test for various mixed sizes of integer loads and stores all getting
; promoted.