Diffstat (limited to 'llvm/lib')
-rw-r--r--   llvm/lib/Transforms/Scalar/SROA.cpp   81
1 file changed, 54 insertions, 27 deletions
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index ef91effe3ff..1653859e91c 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -735,16 +735,10 @@ private:
   void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
                          uint64_t Size, bool IsVolatile) {
-    // We allow splitting of loads and stores where the type is an integer type
-    // and cover the entire alloca. This prevents us from splitting over
-    // eagerly.
-    // FIXME: In the great blue eventually, we should eagerly split all integer
-    // loads and stores, and then have a separate step that merges adjacent
-    // alloca partitions into a single partition suitable for integer widening.
-    // Or we should skip the merge step and rely on GVN and other passes to
-    // merge adjacent loads and stores that survive mem2reg.
-    bool IsSplittable =
-        Ty->isIntegerTy() && !IsVolatile && Offset == 0 && Size >= AllocSize;
+    // We allow splitting of non-volatile loads and stores where the type is an
+    // integer type. These may be used to implement 'memcpy' or other "transfer
+    // of bits" patterns.
+    bool IsSplittable = Ty->isIntegerTy() && !IsVolatile;

     insertUse(I, Offset, Size, IsSplittable);
   }
@@ -2620,7 +2614,8 @@ private:
       // LI only used for this computation.
       Value *Placeholder =
           new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
-      V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset, "insert");
+      V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
+                        "insert");
       LI.replaceAllUsesWith(V);
       Placeholder->replaceAllUsesWith(&LI);
       delete Placeholder;
@@ -2699,7 +2694,8 @@ private:
              DL.getTypeStoreSizeInBits(V->getType()) &&
              "Non-byte-multiple bit width");
       IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
-      V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset, "extract");
+      V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
+                         "extract");
     }

     if (VecTy)
@@ -3571,7 +3567,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
        assert(Offsets.Splits.empty() &&
               "Should not have splits the first time we see an instruction!");
        Offsets.S = &S;
-        Offsets.Splits.push_back(P.endOffset());
+        Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
      }

      // Now scan the already split slices, and add a split for any of them which
@@ -3586,13 +3582,14 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
        assert(Offsets.S == S && "Found a mismatched slice!");
        assert(!Offsets.Splits.empty() &&
               "Cannot have an empty set of splits on the second partition!");
-        assert(Offsets.Splits.back() == P.beginOffset() &&
+        assert(Offsets.Splits.back() ==
+                   P.beginOffset() - Offsets.S->beginOffset() &&
               "Previous split does not end where this one begins!");

        // Record each split. The last partition's end isn't needed as the size
        // of the slice dictates that.
        if (S->endOffset() > P.endOffset())
-          Offsets.Splits.push_back(P.endOffset());
+          Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
      }
    }
@@ -3705,7 +3702,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
       NewSlices.push_back(
           Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
                 &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
-                /*IsSplittable*/ true));
+                /*IsSplittable*/ false));
       DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
                    << ", " << NewSlices.back().endOffset() << "): " << *PLoad
                    << "\n");
@@ -3843,7 +3840,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
       NewSlices.push_back(
           Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
                 &PStore->getOperandUse(PStore->getPointerOperandIndex()),
-                /*IsSplittable*/ true));
+                /*IsSplittable*/ false));
       DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
                    << ", " << NewSlices.back().endOffset() << "): " << *PStore
                    << "\n");
@@ -3879,24 +3876,29 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
     }

     // Mark the original store as dead now that we've split it up and kill its
-    // slice. Note that we leave the original load in place. It may in turn be
-    // split up if it is an alloca load for some other alloca, but it may be
-    // a normal load. This may introduce redundant loads, but where those can
-    // be merged the rest of the optimizer should handle the merging, and this
-    // uncovers SSA splits which is more important. In practice, the original
-    // loads will almost always be fully split and removed eventually, and the
-    // splits will be merged by any trivial CSE, including instcombine.
+    // slice. Note that we leave the original load in place unless this store
+    // was its only use. It may in turn be split up if it is an alloca load
+    // for some other alloca, but it may be a normal load. This may introduce
+    // redundant loads, but where those can be merged the rest of the optimizer
+    // should handle the merging, and this uncovers SSA splits which is more
+    // important. In practice, the original loads will almost always be fully
+    // split and removed eventually, and the splits will be merged by any
+    // trivial CSE, including instcombine.
+    if (LI->hasOneUse()) {
+      assert(*LI->user_begin() == SI && "Single use isn't this store!");
+      DeadInsts.insert(LI);
+    }
     DeadInsts.insert(SI);
     Offsets.S->kill();
   }

-  // Now we need to remove the killed slices, sort the newly added slices, and
-  // merge the two sorted ranges of slices so that the entire range is sorted
-  // properly for us to re-compute the partitions.
+  // Remove the killed slices that have been pre-split.
   AS.erase(std::remove_if(AS.begin(), AS.end(),
                           [](const Slice &S) { return S.isDead(); }),
            AS.end());

+  // Insert our new slices. This will sort and merge them into the sorted
+  // sequence.
   AS.insert(NewSlices);

   DEBUG(dbgs() << "  Pre-split slices:\n");
@@ -4072,8 +4074,33 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
   unsigned NumPartitions = 0;
   bool Changed = false;

+  // First try to pre-split loads and stores.
   Changed |= presplitLoadsAndStores(AI, AS);

+  // Now that we have identified any pre-splitting opportunities, mark any
+  // splittable (non-whole-alloca) loads and stores as unsplittable. If we fail
+  // to split these during pre-splitting, we want to force them to be
+  // rewritten into a partition.
+  bool IsSorted = true;
+  for (Slice &S : AS) {
+    if (!S.isSplittable())
+      continue;
+    // FIXME: We currently leave whole-alloca splittable loads and stores. This
+    // used to be the only splittable loads and stores and we need to be
+    // confident that the above handling of splittable loads and stores is
+    // completely sufficient before we forcibly disable the remaining handling.
+    if (S.beginOffset() == 0 &&
+        S.endOffset() >= DL->getTypeAllocSize(AI.getAllocatedType()))
+      continue;
+    if (isa<LoadInst>(S.getUse()->getUser()) ||
+        isa<StoreInst>(S.getUse()->getUser())) {
+      S.makeUnsplittable();
+      IsSorted = false;
+    }
+  }
+  if (!IsSorted)
+    std::sort(AS.begin(), AS.end());
+
+  // Rewrite each partition.
   for (auto &P : AS.partitions()) {
     Changed |= rewritePartition(AI, AS, P);
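Several hunks above move the pre-splitting bookkeeping from alloca-absolute offsets to offsets relative to the slice being split (`P.endOffset() - S.beginOffset()`, `NewBeginOffset - BeginOffset`). A minimal standalone sketch of that relative-offset arithmetic, using hypothetical SliceRange/relativeSplits names rather than the real SROA types:

#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for a slice: it covers [Begin, End) bytes of the
// alloca, and its split points are recorded relative to its own begin.
struct SliceRange {
  uint64_t Begin, End;
};

// Collect split points for a slice from the partition end offsets it
// overlaps, storing them relative to the slice begin (mirroring
// `P.endOffset() - S.beginOffset()` in the patch).
std::vector<uint64_t> relativeSplits(const SliceRange &S,
                                     const std::vector<uint64_t> &PartitionEnds) {
  std::vector<uint64_t> Splits;
  for (uint64_t PEnd : PartitionEnds)
    if (PEnd > S.Begin && PEnd < S.End)
      Splits.push_back(PEnd - S.Begin);
  return Splits;
}

int main() {
  // A 16-byte slice starting at alloca offset 8, with partitions ending at
  // offsets 12, 16, and 24. The final end coincides with the slice end and is
  // not recorded ("the last partition's end isn't needed"), leaving splits at
  // 4 and 8 bytes into the slice.
  SliceRange S{8, 24};
  auto Splits = relativeSplits(S, {12, 16, 24});
  assert(Splits.size() == 2 && Splits[0] == 4 && Splits[1] == 8);
  return 0;
}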

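The last hunk tracks an IsSorted flag and re-sorts the slice list after clearing the splittable bit, presumably because that bit participates in the slice ordering. A small self-contained sketch of why the flip can invalidate an already sorted sequence, using a hypothetical MiniSlice type and comparator rather than SROA's real Slice class:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical mini-model of an alloca slice. The comparator orders by begin
// offset, then places unsplittable slices before splittable ones, then orders
// by end offset; the real SROA comparator may differ in details.
struct MiniSlice {
  uint64_t Begin, End;
  bool Splittable;
  bool operator<(const MiniSlice &RHS) const {
    if (Begin != RHS.Begin)
      return Begin < RHS.Begin;
    if (Splittable != RHS.Splittable)
      return !Splittable; // unsplittable slices sort first
    return End < RHS.End;
  }
};

int main() {
  const uint64_t AllocaSize = 16;
  // An unsplittable use covering [0, 12) and a splittable integer-load slice
  // covering [0, 8): sorted under the comparator above.
  std::vector<MiniSlice> Slices = {{0, 12, false}, {0, 8, true}};
  assert(std::is_sorted(Slices.begin(), Slices.end()));

  bool IsSorted = true;
  for (MiniSlice &S : Slices) {
    if (!S.Splittable)
      continue;
    // Leave whole-alloca splittable slices alone, as the patch does.
    if (S.Begin == 0 && S.End >= AllocaSize)
      continue;
    S.Splittable = false; // force it to be rewritten within a partition
    IsSorted = false;
  }
  // The flip moved [0, 8) ahead of [0, 12) in the ordering, so re-sort.
  if (!IsSorted)
    std::sort(Slices.begin(), Slices.end());
  assert(std::is_sorted(Slices.begin(), Slices.end()));
  return 0;
}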
