-rw-r--r--  llvm/lib/Transforms/Scalar/SROA.cpp     | 81
-rw-r--r--  llvm/test/Transforms/SROA/alignment.ll  | 13
-rw-r--r--  llvm/test/Transforms/SROA/basictest.ll  | 55
-rw-r--r--  llvm/test/Transforms/SROA/big-endian.ll | 59
4 files changed, 173 insertions, 35 deletions
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index ef91effe3ff..1653859e91c 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -735,16 +735,10 @@ private:
 
   void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
                          uint64_t Size, bool IsVolatile) {
-    // We allow splitting of loads and stores where the type is an integer type
-    // and cover the entire alloca. This prevents us from splitting over
-    // eagerly.
-    // FIXME: In the great blue eventually, we should eagerly split all integer
-    // loads and stores, and then have a separate step that merges adjacent
-    // alloca partitions into a single partition suitable for integer widening.
-    // Or we should skip the merge step and rely on GVN and other passes to
-    // merge adjacent loads and stores that survive mem2reg.
-    bool IsSplittable =
-        Ty->isIntegerTy() && !IsVolatile && Offset == 0 && Size >= AllocSize;
+    // We allow splitting of non-volatile loads and stores where the type is an
+    // integer type. These may be used to implement 'memcpy' or other "transfer
+    // of bits" patterns.
+    bool IsSplittable = Ty->isIntegerTy() && !IsVolatile;
 
     insertUse(I, Offset, Size, IsSplittable);
   }
@@ -2620,7 +2614,8 @@ private:
       // LI only used for this computation.
       Value *Placeholder =
           new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
-      V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset, "insert");
+      V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
+                        "insert");
       LI.replaceAllUsesWith(V);
       Placeholder->replaceAllUsesWith(&LI);
       delete Placeholder;
@@ -2699,7 +2694,8 @@ private:
                  DL.getTypeStoreSizeInBits(V->getType()) &&
              "Non-byte-multiple bit width");
       IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
-      V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset, "extract");
+      V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
+                         "extract");
     }
 
     if (VecTy)
@@ -3571,7 +3567,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
         assert(Offsets.Splits.empty() &&
                "Should not have splits the first time we see an instruction!");
         Offsets.S = &S;
-        Offsets.Splits.push_back(P.endOffset());
+        Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
       }
 
       // Now scan the already split slices, and add a split for any of them which
@@ -3586,13 +3582,14 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
         assert(Offsets.S == S && "Found a mismatched slice!");
         assert(!Offsets.Splits.empty() &&
                "Cannot have an empty set of splits on the second partition!");
-        assert(Offsets.Splits.back() == P.beginOffset() &&
+        assert(Offsets.Splits.back() ==
+                   P.beginOffset() - Offsets.S->beginOffset() &&
                "Previous split does not end where this one begins!");
 
         // Record each split. The last partition's end isn't needed as the size
        // of the slice dictates that.
         if (S->endOffset() > P.endOffset())
-          Offsets.Splits.push_back(P.endOffset());
+          Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
       }
     }
@@ -3705,7 +3702,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
       NewSlices.push_back(
           Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
                 &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
-                /*IsSplittable*/ true));
+                /*IsSplittable*/ false));
       DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
                    << ", " << NewSlices.back().endOffset()
                    << "): " << *PLoad << "\n");
@@ -3843,7 +3840,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
         NewSlices.push_back(
             Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
                   &PStore->getOperandUse(PStore->getPointerOperandIndex()),
-                  /*IsSplittable*/ true));
+                  /*IsSplittable*/ false));
         DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
                      << ", " << NewSlices.back().endOffset()
                      << "): " << *PStore << "\n");
@@ -3879,24 +3876,29 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
     }
 
     // Mark the original store as dead now that we've split it up and kill its
-    // slice. Note that we leave the original load in place. It may in turn be
-    // split up if it is an alloca load for some other alloca, but it may be
-    // a normal load. This may introduce redundant loads, but where those can
-    // be merged the rest of the optimizer should handle the merging, and this
-    // uncovers SSA splits which is more important. In practice, the original
-    // loads will almost always be fully split and removed eventually, and the
-    // splits will be merged by any trivial CSE, including instcombine.
+    // slice. Note that we leave the original load in place unless this store
+    // was its only use. It may in turn be split up if it is an alloca load
+    // for some other alloca, but it may be a normal load. This may introduce
+    // redundant loads, but where those can be merged the rest of the optimizer
+    // should handle the merging, and this uncovers SSA splits which is more
+    // important. In practice, the original loads will almost always be fully
+    // split and removed eventually, and the splits will be merged by any
+    // trivial CSE, including instcombine.
+    if (LI->hasOneUse()) {
+      assert(*LI->user_begin() == SI && "Single use isn't this store!");
+      DeadInsts.insert(LI);
+    }
     DeadInsts.insert(SI);
     Offsets.S->kill();
   }
 
-  // Now we need to remove the killed slices, sort the newly added slices, and
-  // merge the two sorted ranges of slices so that the entire range is sorted
-  // properly for us to re-compute the partitions.
+  // Remove the killed slices that have been pre-split.
   AS.erase(std::remove_if(AS.begin(), AS.end(),
                           [](const Slice &S) { return S.isDead(); }),
            AS.end());
 
+  // Insert our new slices. This will sort and merge them into the sorted
+  // sequence.
   AS.insert(NewSlices);
 
   DEBUG(dbgs() << "  Pre-split slices:\n");
@@ -4072,8 +4074,33 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
   unsigned NumPartitions = 0;
   bool Changed = false;
 
+  // First try to pre-split loads and stores.
   Changed |= presplitLoadsAndStores(AI, AS);
 
+  // Now that we have identified any pre-splitting opportunities, mark any
+  // splittable (non-whole-alloca) loads and stores as unsplittable. If we fail
+  // to split these during pre-splitting, we want to force them to be
+  // rewritten into a partition.
+  bool IsSorted = true;
+  for (Slice &S : AS) {
+    if (!S.isSplittable())
+      continue;
+    // FIXME: We currently leave whole-alloca splittable loads and stores. This
+    // used to be the only splittable loads and stores and we need to be
+    // confident that the above handling of splittable loads and stores is
+    // completely sufficient before we forcibly disable the remaining handling.
+    if (S.beginOffset() == 0 &&
+        S.endOffset() >= DL->getTypeAllocSize(AI.getAllocatedType()))
+      continue;
+    if (isa<LoadInst>(S.getUse()->getUser()) ||
+        isa<StoreInst>(S.getUse()->getUser())) {
+      S.makeUnsplittable();
+      IsSorted = false;
+    }
+  }
+  if (!IsSorted)
+    std::sort(AS.begin(), AS.end());
+
+  // Rewrite each partition.
   for (auto &P : AS.partitions()) {
     Changed |= rewritePartition(AI, AS, P);
diff --git a/llvm/test/Transforms/SROA/alignment.ll b/llvm/test/Transforms/SROA/alignment.ll
index 5fa78766ed0..4f4e40cf76a 100644
--- a/llvm/test/Transforms/SROA/alignment.ll
+++ b/llvm/test/Transforms/SROA/alignment.ll
@@ -85,15 +85,18 @@ entry:
 }
 
 define void @test5() {
-; Test that we preserve underaligned loads and stores when splitting.
+; Test that we preserve underaligned loads and stores when splitting. The use
+; of volatile in this test case is just to force the loads and stores to not be
+; split or promoted out of existence.
+;
 ; CHECK-LABEL: @test5(
 ; CHECK: alloca [9 x i8]
 ; CHECK: alloca [9 x i8]
 ; CHECK: store volatile double 0.0{{.*}}, double* %{{.*}}, align 1
-; CHECK: load i16* %{{.*}}, align 1
+; CHECK: load volatile i16* %{{.*}}, align 1
 ; CHECK: load double* %{{.*}}, align 1
 ; CHECK: store volatile double %{{.*}}, double* %{{.*}}, align 1
-; CHECK: load i16* %{{.*}}, align 1
+; CHECK: load volatile i16* %{{.*}}, align 1
 ; CHECK: ret void
 
 entry:
@@ -103,7 +106,7 @@ entry:
   store volatile double 0.0, double* %ptr1, align 1
   %weird_gep1 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 7
   %weird_cast1 = bitcast i8* %weird_gep1 to i16*
-  %weird_load1 = load i16* %weird_cast1, align 1
+  %weird_load1 = load volatile i16* %weird_cast1, align 1
 
   %raw2 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 9
   %ptr2 = bitcast i8* %raw2 to double*
@@ -111,7 +114,7 @@ entry:
   store volatile double %d1, double* %ptr2, align 1
   %weird_gep2 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 16
   %weird_cast2 = bitcast i8* %weird_gep2 to i16*
-  %weird_load2 = load i16* %weird_cast2, align 1
+  %weird_load2 = load volatile i16* %weird_cast2, align 1
 
   ret void
 }
diff --git a/llvm/test/Transforms/SROA/basictest.ll b/llvm/test/Transforms/SROA/basictest.ll
index 7cedca56c48..9edb2b42dc0 100644
--- a/llvm/test/Transforms/SROA/basictest.ll
+++ b/llvm/test/Transforms/SROA/basictest.ll
@@ -572,7 +572,8 @@ bad:
 }
 
 define i8 @test12() {
-; We promote these to three SSA values which fold away immediately.
+; We fully promote these to the i24 load or store size, resulting in just masks
+; and other operations that instcombine will fold, but no alloca.
 ;
 ; CHECK-LABEL: @test12(
 
@@ -591,6 +592,17 @@ entry:
   %ai = load i24* %aiptr
 ; CHECK-NOT: store
 ; CHECK-NOT: load
+; CHECK: %[[ext2:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift2:.*]] = shl i24 %[[ext2]], 16
+; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, 65535
+; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[shift2]]
+; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8
+; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281
+; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]]
+; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], -256
+; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[ext0]]
 
   %biptr = bitcast [3 x i8]* %b to i24*
   store i24 %ai, i24* %biptr
@@ -602,12 +614,17 @@ entry:
   %b2 = load i8* %b2ptr
 ; CHECK-NOT: store
 ; CHECK-NOT: load
+; CHECK: %[[trunc0:.*]] = trunc i24 %[[insert0]] to i8
+; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8
+; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8
+; CHECK-NEXT: %[[shift2:.*]] = lshr i24 %[[insert0]], 16
+; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[shift2]] to i8
 
   %bsum0 = add i8 %b0, %b1
   %bsum1 = add i8 %bsum0, %b2
   ret i8 %bsum1
-; CHECK: %[[sum0:.*]] = add i8 0, 0
-; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], 0
+; CHECK: %[[sum0:.*]] = add i8 %[[trunc0]], %[[trunc1]]
+; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], %[[trunc2]]
 ; CHECK-NEXT: ret i8 %[[sum1]]
 }
@@ -1492,3 +1509,35 @@ entry:
   store i64 %v2, i64* bitcast ([2 x float]* @complex2 to i64*)
   ret void
 }
+
+define float @test27() {
+; Another, more complex case of splittable i64 loads and stores. This example
+; is a particularly challenging one because the load and store both point into
+; the alloca SROA is processing, and they overlap but at an offset.
+;
+; CHECK-LABEL: @test27(
+; CHECK-NOT: alloca
+; CHECK: %[[F1:.*]] = bitcast i32 0 to float
+; CHECK: %[[F2:.*]] = bitcast i32 1065353216 to float
+; CHECK: %[[SUM:.*]] = fadd float %[[F1]], %[[F2]]
+; CHECK: ret float %[[SUM]]
+
+entry:
+  %a = alloca [12 x i8]
+  %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
+  %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
+  %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
+  %iptr1 = bitcast i8* %gep1 to i64*
+  %iptr2 = bitcast i8* %gep2 to i64*
+  %fptr1 = bitcast i8* %gep1 to float*
+  %fptr2 = bitcast i8* %gep2 to float*
+  %fptr3 = bitcast i8* %gep3 to float*
+  store float 0.0, float* %fptr1
+  store float 1.0, float* %fptr2
+  %v = load i64* %iptr1
+  store i64 %v, i64* %iptr2
+  %f1 = load float* %fptr2
+  %f2 = load float* %fptr3
+  %ret = fadd float %f1, %f2
+  ret float %ret
+}
diff --git a/llvm/test/Transforms/SROA/big-endian.ll b/llvm/test/Transforms/SROA/big-endian.ll
index 1ef074844d1..9e87a9f073c 100644
--- a/llvm/test/Transforms/SROA/big-endian.ll
+++ b/llvm/test/Transforms/SROA/big-endian.ll
@@ -3,6 +3,65 @@
 
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
 
+define i8 @test1() {
+; We fully promote these to the i24 load or store size, resulting in just masks
+; and other operations that instcombine will fold, but no alloca. Note this is
+; the same as test12 in basictest.ll, but here we assert big-endian byte
+; ordering.
+;
+; CHECK-LABEL: @test1(
+
+entry:
+  %a = alloca [3 x i8]
+  %b = alloca [3 x i8]
+; CHECK-NOT: alloca
+
+  %a0ptr = getelementptr [3 x i8]* %a, i64 0, i32 0
+  store i8 0, i8* %a0ptr
+  %a1ptr = getelementptr [3 x i8]* %a, i64 0, i32 1
+  store i8 0, i8* %a1ptr
+  %a2ptr = getelementptr [3 x i8]* %a, i64 0, i32 2
+  store i8 0, i8* %a2ptr
+  %aiptr = bitcast [3 x i8]* %a to i24*
+  %ai = load i24* %aiptr
+; CHECK-NOT: store
+; CHECK-NOT: load
+; CHECK: %[[ext2:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, -256
+; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[ext2]]
+; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8
+; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281
+; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]]
+; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift0:.*]] = shl i24 %[[ext0]], 16
+; CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], 65535
+; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[shift0]]
+
+  %biptr = bitcast [3 x i8]* %b to i24*
+  store i24 %ai, i24* %biptr
+  %b0ptr = getelementptr [3 x i8]* %b, i64 0, i32 0
+  %b0 = load i8* %b0ptr
+  %b1ptr = getelementptr [3 x i8]* %b, i64 0, i32 1
+  %b1 = load i8* %b1ptr
+  %b2ptr = getelementptr [3 x i8]* %b, i64 0, i32 2
+  %b2 = load i8* %b2ptr
+; CHECK-NOT: store
+; CHECK-NOT: load
+; CHECK: %[[shift0:.*]] = lshr i24 %[[insert0]], 16
+; CHECK-NEXT: %[[trunc0:.*]] = trunc i24 %[[shift0]] to i8
+; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8
+; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8
+; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[insert0]] to i8
+
+  %bsum0 = add i8 %b0, %b1
+  %bsum1 = add i8 %bsum0, %b2
+  ret i8 %bsum1
+; CHECK: %[[sum0:.*]] = add i8 %[[trunc0]], %[[trunc1]]
+; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], %[[trunc2]]
+; CHECK-NEXT: ret i8 %[[sum1]]
+}
+
 define i64 @test2() {
 ; Test for various mixed sizes of integer loads and stores all getting
 ; promoted.
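As an illustrative sketch (not part of this commit), the IR below shows the kind of slice the relaxed rule in handleLoadOrStore now marks splittable: a non-volatile integer load that covers only part of its alloca. The function name @partial_slice_example and its constants are made up for illustration. Under the old predicate this load was never splittable because it neither starts at offset 0 nor covers the whole alloca; with this change it starts out splittable, and splitAlloca later calls makeUnsplittable() on it if presplitLoadsAndStores did not consume it.

define i16 @partial_slice_example() {
entry:
  %a = alloca [8 x i8]
  %gep = getelementptr [8 x i8]* %a, i32 0, i32 2
  %cast = bitcast i8* %gep to i16*
  store i16 7, i16* %cast
  ; This non-volatile integer load forms slice [2, 4) of the 8-byte alloca.
  ; Old rule: IsSplittable = false (Offset != 0 and Size < AllocSize).
  ; New rule: IsSplittable = true, later forced back to unsplittable in
  ; splitAlloca if pre-splitting could not handle it.
  %val = load i16* %cast
  ret i16 %val
}

Such a function could be exercised the same way as the tests above, for example via opt -sroa -S piped into FileCheck; the exact CHECK lines would depend on the output SROA actually produces for it.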

