-rw-r--r--  llvm/lib/Transforms/Scalar/SROA.cpp      81
-rw-r--r--  llvm/test/Transforms/SROA/alignment.ll   13
-rw-r--r--  llvm/test/Transforms/SROA/basictest.ll   55
-rw-r--r--  llvm/test/Transforms/SROA/big-endian.ll  59
4 files changed, 173 insertions, 35 deletions
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index ef91effe3ff..1653859e91c 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -735,16 +735,10 @@ private:
void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
uint64_t Size, bool IsVolatile) {
- // We allow splitting of loads and stores where the type is an integer type
- // and cover the entire alloca. This prevents us from splitting over
- // eagerly.
- // FIXME: In the great blue eventually, we should eagerly split all integer
- // loads and stores, and then have a separate step that merges adjacent
- // alloca partitions into a single partition suitable for integer widening.
- // Or we should skip the merge step and rely on GVN and other passes to
- // merge adjacent loads and stores that survive mem2reg.
- bool IsSplittable =
- Ty->isIntegerTy() && !IsVolatile && Offset == 0 && Size >= AllocSize;
+ // We allow splitting of non-volatile loads and stores where the type is an
+ // integer type. These may be used to implement 'memcpy' or other "transfer
+ // of bits" patterns.
+ bool IsSplittable = Ty->isIntegerTy() && !IsVolatile;
insertUse(I, Offset, Size, IsSplittable);
}
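A minimal sketch of the kind of "transfer of bits" pattern the relaxed rule targets (invented for illustration, not part of the patch; function and value names are made up), written in the same pre-3.7 IR syntax the tests below use. Under the old rule neither wide access was splittable: the i64 load does not cover the whole 16-byte alloca and the i64 store does not start at offset 0. Under the new rule both are candidates, so pre-splitting can break them into the i32 pieces the rest of the code actually uses.

; Illustrative only: an i64 copy between the two halves of a 16-byte alloca.
define i32 @bit_transfer(i32 %x) {
entry:
  %a = alloca [16 x i8]
  %lo = getelementptr [16 x i8]* %a, i64 0, i32 0
  %hi = getelementptr [16 x i8]* %a, i64 0, i32 8
  %lo32 = bitcast i8* %lo to i32*
  store i32 %x, i32* %lo32
  %lo64 = bitcast i8* %lo to i64*
  %hi64 = bitcast i8* %hi to i64*
  %v = load i64* %lo64          ; non-volatile integer load: now splittable
  store i64 %v, i64* %hi64      ; non-volatile integer store: now splittable
  %hi32 = bitcast i8* %hi to i32*
  %r = load i32* %hi32
  ret i32 %r
}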
@@ -2620,7 +2614,8 @@ private:
// LI only used for this computation.
Value *Placeholder =
new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
- V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset, "insert");
+ V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
+ "insert");
LI.replaceAllUsesWith(V);
Placeholder->replaceAllUsesWith(&LI);
delete Placeholder;
@@ -2699,7 +2694,8 @@ private:
DL.getTypeStoreSizeInBits(V->getType()) &&
"Non-byte-multiple bit width");
IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
- V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset, "extract");
+ V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
+ "extract");
}
if (VecTy)
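The two hunks above fix the shift amount fed to insertInteger and extractInteger when rewriting one partition of a split load or store: the offset must be measured from the start of the slice's own integer value, not from the start of the alloca. A worked example with invented numbers: if the wide value occupies alloca bytes [4, 12) (BeginOffset = 4) and the partition being rewritten covers bytes [6, 8) (NewBeginOffset = 6), the i16 piece lives at byte NewBeginOffset - BeginOffset = 2 of that value; passing the absolute offset 6, as the old code did, shifts the piece into entirely the wrong bits.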
@@ -3571,7 +3567,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
assert(Offsets.Splits.empty() &&
"Should not have splits the first time we see an instruction!");
Offsets.S = &S;
- Offsets.Splits.push_back(P.endOffset());
+ Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
}
// Now scan the already split slices, and add a split for any of them which
@@ -3586,13 +3582,14 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
assert(Offsets.S == S && "Found a mismatched slice!");
assert(!Offsets.Splits.empty() &&
"Cannot have an empty set of splits on the second partition!");
- assert(Offsets.Splits.back() == P.beginOffset() &&
+ assert(Offsets.Splits.back() ==
+ P.beginOffset() - Offsets.S->beginOffset() &&
"Previous split does not end where this one begins!");
// Record each split. The last partition's end isn't needed as the size
// of the slice dictates that.
if (S->endOffset() > P.endOffset())
- Offsets.Splits.push_back(P.endOffset());
+ Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
}
}
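The same relativization applies to the split-point bookkeeping here. A sketch with invented numbers: a splittable slice covering alloca bytes [4, 12) that is crossed by partitions ending at alloca offsets 6 and 8 now records Offsets.Splits = {6 - 4, 8 - 4} = {2, 4}, offsets within the slice itself rather than within the alloca; the end of the final piece is never recorded because the slice's size already determines it.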
@@ -3705,7 +3702,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
NewSlices.push_back(
Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
&PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
- /*IsSplittable*/ true));
+ /*IsSplittable*/ false));
DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
<< ", " << NewSlices.back().endOffset() << "): " << *PLoad
<< "\n");
@@ -3843,7 +3840,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
NewSlices.push_back(
Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
&PStore->getOperandUse(PStore->getPointerOperandIndex()),
- /*IsSplittable*/ true));
+ /*IsSplittable*/ false));
DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
<< ", " << NewSlices.back().endOffset() << "): " << *PStore
<< "\n");
@@ -3879,24 +3876,29 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
}
// Mark the original store as dead now that we've split it up and kill its
- // slice. Note that we leave the original load in place. It may in turn be
- // split up if it is an alloca load for some other alloca, but it may be
- // a normal load. This may introduce redundant loads, but where those can
- // be merged the rest of the optimizer should handle the merging, and this
- // uncovers SSA splits which is more important. In practice, the original
- // loads will almost always be fully split and removed eventually, and the
- // splits will be merged by any trivial CSE, including instcombine.
+ // slice. Note that we leave the original load in place unless this store
+ // was its only use. It may in turn be split up if it is an alloca load
+ // for some other alloca, but it may be a normal load. This may introduce
+ // redundant loads, but where those can be merged the rest of the optimizer
+ // should handle the merging, and this uncovers SSA splits which is more
+ // important. In practice, the original loads will almost always be fully
+ // split and removed eventually, and the splits will be merged by any
+ // trivial CSE, including instcombine.
+ if (LI->hasOneUse()) {
+ assert(*LI->user_begin() == SI && "Single use isn't this store!");
+ DeadInsts.insert(LI);
+ }
DeadInsts.insert(SI);
Offsets.S->kill();
}
- // Now we need to remove the killed slices, sort the newly added slices, and
- // merge the two sorted ranges of slices so that the entire range is sorted
- // properly for us to re-compute the partitions.
+ // Remove the killed slices that have been pre-split.
AS.erase(std::remove_if(AS.begin(), AS.end(), [](const Slice &S) {
return S.isDead();
}), AS.end());
+ // Insert our new slices. This will sort and merge them into the sorted
+ // sequence.
AS.insert(NewSlices);
DEBUG(dbgs() << " Pre-split slices:\n");
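A sketch (not taken from the patch; names are invented) of the single-use case the new LI->hasOneUse() check covers: when the wide integer load feeding a pre-split store has no user other than that store, both halves of the copy are replaced by their split pieces, so the original load can be queued for deletion right away instead of surviving as a redundant load for later passes to clean up.

; Illustrative shape only: the i64 load's sole user is the i64 store.
define i32 @copy_then_use(i32 %x, i32 %y) {
entry:
  %src = alloca [8 x i8]
  %dst = alloca [8 x i8]
  %s0 = bitcast [8 x i8]* %src to i32*
  %s4raw = getelementptr [8 x i8]* %src, i64 0, i32 4
  %s4 = bitcast i8* %s4raw to i32*
  store i32 %x, i32* %s0
  store i32 %y, i32* %s4
  %sw = bitcast [8 x i8]* %src to i64*
  %dw = bitcast [8 x i8]* %dst to i64*
  %v = load i64* %sw            ; only use is the store below
  store i64 %v, i64* %dw        ; pre-split into two i32 pieces
  %d4raw = getelementptr [8 x i8]* %dst, i64 0, i32 4
  %d4 = bitcast i8* %d4raw to i32*
  %r = load i32* %d4
  ret i32 %r
}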
@@ -4072,8 +4074,33 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
unsigned NumPartitions = 0;
bool Changed = false;
+ // First try to pre-split loads and stores.
Changed |= presplitLoadsAndStores(AI, AS);
+ // Now that we have identified any pre-splitting opportunities, mark any
+ // splittable (non-whole-alloca) loads and stores as unsplittable. If we fail
+ // to split these during pre-splitting, we want to force them to be
+ // rewritten into a partition.
+ bool IsSorted = true;
+ for (Slice &S : AS) {
+ if (!S.isSplittable())
+ continue;
+ // FIXME: We currently leave whole-alloca splittable loads and stores. These
+ // used to be the only splittable loads and stores and we need to be
+ // confident that the above handling of splittable loads and stores is
+ // completely sufficient before we forcibly disable the remaining handling.
+ if (S.beginOffset() == 0 &&
+ S.endOffset() >= DL->getTypeAllocSize(AI.getAllocatedType()))
+ continue;
+ if (isa<LoadInst>(S.getUse()->getUser()) ||
+ isa<StoreInst>(S.getUse()->getUser())) {
+ S.makeUnsplittable();
+ IsSorted = false;
+ }
+ }
+ if (!IsSorted)
+ std::sort(AS.begin(), AS.end());
+
// Rewrite each partition.
for (auto &P : AS.partitions()) {
Changed |= rewritePartition(AI, AS, P);
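A small sketch of the policy added above, with invented slices over an 8-byte alloca:

  [0, 8)  i64 load     splittable, whole-alloca: left splittable (the FIXME carve-out)
  [2, 4)  i16 load     splittable but not pre-split: makeUnsplittable()
  [4, 8)  float store  never splittable: untouched

Forcing the leftover i16 load to be unsplittable means partition formation must place it whole inside a single partition to be rewritten there. Clearing the splittable bit can also change how slices compare, hence the IsSorted flag and the re-sort before partitioning.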
diff --git a/llvm/test/Transforms/SROA/alignment.ll b/llvm/test/Transforms/SROA/alignment.ll
index 5fa78766ed0..4f4e40cf76a 100644
--- a/llvm/test/Transforms/SROA/alignment.ll
+++ b/llvm/test/Transforms/SROA/alignment.ll
@@ -85,15 +85,18 @@ entry:
}
define void @test5() {
-; Test that we preserve underaligned loads and stores when splitting.
+; Test that we preserve underaligned loads and stores when splitting. The use
+; of volatile in this test case is just to force the loads and stores to not be
+; split or promoted out of existence.
+;
; CHECK-LABEL: @test5(
; CHECK: alloca [9 x i8]
; CHECK: alloca [9 x i8]
; CHECK: store volatile double 0.0{{.*}}, double* %{{.*}}, align 1
-; CHECK: load i16* %{{.*}}, align 1
+; CHECK: load volatile i16* %{{.*}}, align 1
; CHECK: load double* %{{.*}}, align 1
; CHECK: store volatile double %{{.*}}, double* %{{.*}}, align 1
-; CHECK: load i16* %{{.*}}, align 1
+; CHECK: load volatile i16* %{{.*}}, align 1
; CHECK: ret void
entry:
@@ -103,7 +106,7 @@ entry:
store volatile double 0.0, double* %ptr1, align 1
%weird_gep1 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 7
%weird_cast1 = bitcast i8* %weird_gep1 to i16*
- %weird_load1 = load i16* %weird_cast1, align 1
+ %weird_load1 = load volatile i16* %weird_cast1, align 1
%raw2 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 9
%ptr2 = bitcast i8* %raw2 to double*
@@ -111,7 +114,7 @@ entry:
store volatile double %d1, double* %ptr2, align 1
%weird_gep2 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 16
%weird_cast2 = bitcast i8* %weird_gep2 to i16*
- %weird_load2 = load i16* %weird_cast2, align 1
+ %weird_load2 = load volatile i16* %weird_cast2, align 1
ret void
}
diff --git a/llvm/test/Transforms/SROA/basictest.ll b/llvm/test/Transforms/SROA/basictest.ll
index 7cedca56c48..9edb2b42dc0 100644
--- a/llvm/test/Transforms/SROA/basictest.ll
+++ b/llvm/test/Transforms/SROA/basictest.ll
@@ -572,7 +572,8 @@ bad:
}
define i8 @test12() {
-; We promote these to three SSA values which fold away immediately.
+; We fully promote these to the i24 load or store size, resulting in just masks
+; and other operations that instcombine will fold, but no alloca.
;
; CHECK-LABEL: @test12(
@@ -591,6 +592,17 @@ entry:
%ai = load i24* %aiptr
; CHECK-NOT: store
; CHECK-NOT: load
+; CHECK: %[[ext2:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift2:.*]] = shl i24 %[[ext2]], 16
+; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, 65535
+; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[shift2]]
+; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8
+; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281
+; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]]
+; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], -256
+; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[ext0]]
%biptr = bitcast [3 x i8]* %b to i24*
store i24 %ai, i24* %biptr
@@ -602,12 +614,17 @@ entry:
%b2 = load i8* %b2ptr
; CHECK-NOT: store
; CHECK-NOT: load
+; CHECK: %[[trunc0:.*]] = trunc i24 %[[insert0]] to i8
+; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8
+; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8
+; CHECK-NEXT: %[[shift2:.*]] = lshr i24 %[[insert0]], 16
+; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[shift2]] to i8
%bsum0 = add i8 %b0, %b1
%bsum1 = add i8 %bsum0, %b2
ret i8 %bsum1
-; CHECK: %[[sum0:.*]] = add i8 0, 0
-; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], 0
+; CHECK: %[[sum0:.*]] = add i8 %[[trunc0]], %[[trunc1]]
+; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], %[[trunc2]]
; CHECK-NEXT: ret i8 %[[sum1]]
}
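For reference, the byte-to-bit mapping behind these CHECK lines: on the little-endian target used by this file, byte i of the 3-byte alloca occupies bits [8*i, 8*i + 8) of the i24, i.e. value = b2 * 2^16 + b1 * 2^8 + b0, which is why byte 2 is inserted with a 16-bit shl and byte 0 needs no shift at all. The big-endian mirror of this test is added to big-endian.ll below.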
@@ -1492,3 +1509,35 @@ entry:
store i64 %v2, i64* bitcast ([2 x float]* @complex2 to i64*)
ret void
}
+
+define float @test27() {
+; Another, more complex case of splittable i64 loads and stores. This example
+; is a particularly challenging one because the load and store both point into
+; the alloca SROA is processing, and they overlap but at an offset.
+;
+; CHECK-LABEL: @test27(
+; CHECK-NOT: alloca
+; CHECK: %[[F1:.*]] = bitcast i32 0 to float
+; CHECK: %[[F2:.*]] = bitcast i32 1065353216 to float
+; CHECK: %[[SUM:.*]] = fadd float %[[F1]], %[[F2]]
+; CHECK: ret float %[[SUM]]
+
+entry:
+ %a = alloca [12 x i8]
+ %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
+ %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
+ %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
+ %iptr1 = bitcast i8* %gep1 to i64*
+ %iptr2 = bitcast i8* %gep2 to i64*
+ %fptr1 = bitcast i8* %gep1 to float*
+ %fptr2 = bitcast i8* %gep2 to float*
+ %fptr3 = bitcast i8* %gep3 to float*
+ store float 0.0, float* %fptr1
+ store float 1.0, float* %fptr2
+ %v = load i64* %iptr1
+ store i64 %v, i64* %iptr2
+ %f1 = load float* %fptr2
+ %f2 = load float* %fptr3
+ %ret = fadd float %f1, %f2
+ ret float %ret
+}
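A byte-level walk-through of what the new test exercises (offsets are into the 12-byte alloca): the float stores put 0.0 in bytes [0, 4) and 1.0 in bytes [4, 8); the i64 load then reads [0, 8) and the i64 store copies those bytes to [4, 12), leaving 0.0 in [4, 8) and 1.0 in [8, 12). The final float loads therefore see 0.0 and 1.0, whose bit patterns are the i32 constants 0 and 1065353216 in the CHECK lines, and the fadd of the two is 1.0. This overlapping copy at a non-zero offset is exactly the case the relative-offset fixes in SROA.cpp are needed for.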
diff --git a/llvm/test/Transforms/SROA/big-endian.ll b/llvm/test/Transforms/SROA/big-endian.ll
index 1ef074844d1..9e87a9f073c 100644
--- a/llvm/test/Transforms/SROA/big-endian.ll
+++ b/llvm/test/Transforms/SROA/big-endian.ll
@@ -3,6 +3,65 @@
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+define i8 @test1() {
+; We fully promote these to the i24 load or store size, resulting in just masks
+; and other operations that instcombine will fold, but no alloca. Note this is
+; the same as test12 in basictest.ll, but here we assert big-endian byte
+; ordering.
+;
+; CHECK-LABEL: @test1(
+
+entry:
+ %a = alloca [3 x i8]
+ %b = alloca [3 x i8]
+; CHECK-NOT: alloca
+
+ %a0ptr = getelementptr [3 x i8]* %a, i64 0, i32 0
+ store i8 0, i8* %a0ptr
+ %a1ptr = getelementptr [3 x i8]* %a, i64 0, i32 1
+ store i8 0, i8* %a1ptr
+ %a2ptr = getelementptr [3 x i8]* %a, i64 0, i32 2
+ store i8 0, i8* %a2ptr
+ %aiptr = bitcast [3 x i8]* %a to i24*
+ %ai = load i24* %aiptr
+; CHECK-NOT: store
+; CHECK-NOT: load
+; CHECK: %[[ext2:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, -256
+; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[ext2]]
+; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8
+; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281
+; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]]
+; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift0:.*]] = shl i24 %[[ext0]], 16
+; CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], 65535
+; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[shift0]]
+
+ %biptr = bitcast [3 x i8]* %b to i24*
+ store i24 %ai, i24* %biptr
+ %b0ptr = getelementptr [3 x i8]* %b, i64 0, i32 0
+ %b0 = load i8* %b0ptr
+ %b1ptr = getelementptr [3 x i8]* %b, i64 0, i32 1
+ %b1 = load i8* %b1ptr
+ %b2ptr = getelementptr [3 x i8]* %b, i64 0, i32 2
+ %b2 = load i8* %b2ptr
+; CHECK-NOT: store
+; CHECK-NOT: load
+; CHECK: %[[shift0:.*]] = lshr i24 %[[insert0]], 16
+; CHECK-NEXT: %[[trunc0:.*]] = trunc i24 %[[shift0]] to i8
+; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8
+; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8
+; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[insert0]] to i8
+
+ %bsum0 = add i8 %b0, %b1
+ %bsum1 = add i8 %bsum0, %b2
+ ret i8 %bsum1
+; CHECK: %[[sum0:.*]] = add i8 %[[trunc0]], %[[trunc1]]
+; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], %[[trunc2]]
+; CHECK-NEXT: ret i8 %[[sum1]]
+}
+
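The only difference from the little-endian test12 is the byte-to-bit mapping: on this big-endian target, byte i of the 3-byte alloca occupies bits [16 - 8*i, 24 - 8*i) of the i24, i.e. value = b0 * 2^16 + b1 * 2^8 + b2, so byte 0 is the piece inserted and extracted with a 16-bit shift while byte 2 needs no shift, the mirror image of the CHECK lines in basictest.ll.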
define i64 @test2() {
; Test for various mixed sizes of integer loads and stores all getting
; promoted.