-rw-r--r--  llvm/lib/Transforms/Scalar/SROA.cpp     | 81
-rw-r--r--  llvm/test/Transforms/SROA/alignment.ll  | 13
-rw-r--r--  llvm/test/Transforms/SROA/basictest.ll  | 55
-rw-r--r--  llvm/test/Transforms/SROA/big-endian.ll | 59
4 files changed, 173 insertions, 35 deletions
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index ef91effe3ff..1653859e91c 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -735,16 +735,10 @@ private:
 
   void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
                          uint64_t Size, bool IsVolatile) {
-    // We allow splitting of loads and stores where the type is an integer type
-    // and cover the entire alloca. This prevents us from splitting over
-    // eagerly.
-    // FIXME: In the great blue eventually, we should eagerly split all integer
-    // loads and stores, and then have a separate step that merges adjacent
-    // alloca partitions into a single partition suitable for integer widening.
-    // Or we should skip the merge step and rely on GVN and other passes to
-    // merge adjacent loads and stores that survive mem2reg.
-    bool IsSplittable =
-        Ty->isIntegerTy() && !IsVolatile && Offset == 0 && Size >= AllocSize;
+    // We allow splitting of non-volatile loads and stores where the type is an
+    // integer type. These may be used to implement 'memcpy' or other "transfer
+    // of bits" patterns.
+    bool IsSplittable = Ty->isIntegerTy() && !IsVolatile;
 
     insertUse(I, Offset, Size, IsSplittable);
   }
@@ -2620,7 +2614,8 @@ private:
       // LI only used for this computation.
       Value *Placeholder =
           new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
-      V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset, "insert");
+      V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
+                        "insert");
       LI.replaceAllUsesWith(V);
       Placeholder->replaceAllUsesWith(&LI);
       delete Placeholder;
@@ -2699,7 +2694,8 @@ private:
                  DL.getTypeStoreSizeInBits(V->getType()) &&
              "Non-byte-multiple bit width");
       IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
-      V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset, "extract");
+      V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
+                         "extract");
     }
 
     if (VecTy)
@@ -3571,7 +3567,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
         assert(Offsets.Splits.empty() &&
                "Should not have splits the first time we see an instruction!");
         Offsets.S = &S;
-        Offsets.Splits.push_back(P.endOffset());
+        Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
       }
 
       // Now scan the already split slices, and add a split for any of them which
@@ -3586,13 +3582,14 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
         assert(Offsets.S == S && "Found a mismatched slice!");
         assert(!Offsets.Splits.empty() &&
                "Cannot have an empty set of splits on the second partition!");
-        assert(Offsets.Splits.back() == P.beginOffset() &&
+        assert(Offsets.Splits.back() ==
+                   P.beginOffset() - Offsets.S->beginOffset() &&
                "Previous split does not end where this one begins!");
 
         // Record each split. The last partition's end isn't needed as the size
        // of the slice dictates that.
         if (S->endOffset() > P.endOffset())
-          Offsets.Splits.push_back(P.endOffset());
+          Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
       }
     }
@@ -3705,7 +3702,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
       NewSlices.push_back(
           Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
                 &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
-                /*IsSplittable*/ true));
+                /*IsSplittable*/ false));
       DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
                    << ", " << NewSlices.back().endOffset()
                    << "): " << *PLoad << "\n");
@@ -3843,7 +3840,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
         NewSlices.push_back(
             Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
                   &PStore->getOperandUse(PStore->getPointerOperandIndex()),
-                  /*IsSplittable*/ true));
+                  /*IsSplittable*/ false));
         DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
                      << ", " << NewSlices.back().endOffset()
                      << "): " << *PStore << "\n");
@@ -3879,24 +3876,29 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
     }
 
     // Mark the original store as dead now that we've split it up and kill its
-    // slice. Note that we leave the original load in place. It may in turn be
-    // split up if it is an alloca load for some other alloca, but it may be
-    // a normal load. This may introduce redundant loads, but where those can
-    // be merged the rest of the optimizer should handle the merging, and this
-    // uncovers SSA splits which is more important. In practice, the original
-    // loads will almost always be fully split and removed eventually, and the
-    // splits will be merged by any trivial CSE, including instcombine.
+    // slice. Note that we leave the original load in place unless this store
+    // was its only use. It may in turn be split up if it is an alloca load
+    // for some other alloca, but it may be a normal load. This may introduce
+    // redundant loads, but where those can be merged the rest of the optimizer
+    // should handle the merging, and this uncovers SSA splits which is more
+    // important. In practice, the original loads will almost always be fully
+    // split and removed eventually, and the splits will be merged by any
+    // trivial CSE, including instcombine.
+    if (LI->hasOneUse()) {
+      assert(*LI->user_begin() == SI && "Single use isn't this store!");
+      DeadInsts.insert(LI);
+    }
     DeadInsts.insert(SI);
     Offsets.S->kill();
   }
 
-  // Now we need to remove the killed slices, sort the newly added slices, and
-  // merge the two sorted ranges of slices so that the entire range is sorted
-  // properly for us to re-compute the partitions.
+  // Remove the killed slices that have been pre-split.
   AS.erase(std::remove_if(AS.begin(), AS.end(),
                           [](const Slice &S) { return S.isDead(); }),
            AS.end());
 
+  // Insert our new slices. This will sort and merge them into the sorted
+  // sequence.
   AS.insert(NewSlices);
 
   DEBUG(dbgs() << "  Pre-split slices:\n");
@@ -4072,8 +4074,33 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
   unsigned NumPartitions = 0;
   bool Changed = false;
 
+  // First try to pre-split loads and stores.
   Changed |= presplitLoadsAndStores(AI, AS);
 
+  // Now that we have identified any pre-splitting opportunities, mark any
+  // splittable (non-whole-alloca) loads and stores as unsplittable. If we fail
+  // to split these during pre-splitting, we want to force them to be
+  // rewritten into a partition.
+  bool IsSorted = true;
+  for (Slice &S : AS) {
+    if (!S.isSplittable())
+      continue;
+    // FIXME: We currently leave whole-alloca splittable loads and stores. This
+    // used to be the only splittable loads and stores and we need to be
+    // confident that the above handling of splittable loads and stores is
+    // completely sufficient before we forcibly disable the remaining handling.
+    if (S.beginOffset() == 0 &&
+        S.endOffset() >= DL->getTypeAllocSize(AI.getAllocatedType()))
+      continue;
+    if (isa<LoadInst>(S.getUse()->getUser()) ||
+        isa<StoreInst>(S.getUse()->getUser())) {
+      S.makeUnsplittable();
+      IsSorted = false;
+    }
+  }
+  if (!IsSorted)
+    std::sort(AS.begin(), AS.end());
+
+  // Rewrite each partition.
   for (auto &P : AS.partitions()) {
     Changed |= rewritePartition(AI, AS, P);
diff --git a/llvm/test/Transforms/SROA/alignment.ll b/llvm/test/Transforms/SROA/alignment.ll
index 5fa78766ed0..4f4e40cf76a 100644
--- a/llvm/test/Transforms/SROA/alignment.ll
+++ b/llvm/test/Transforms/SROA/alignment.ll
@@ -85,15 +85,18 @@ entry:
 }
 
 define void @test5() {
-; Test that we preserve underaligned loads and stores when splitting.
+; Test that we preserve underaligned loads and stores when splitting. The use
+; of volatile in this test case is just to force the loads and stores to not be
+; split or promoted out of existence.
+;
 ; CHECK-LABEL: @test5(
 ; CHECK: alloca [9 x i8]
 ; CHECK: alloca [9 x i8]
 ; CHECK: store volatile double 0.0{{.*}}, double* %{{.*}}, align 1
-; CHECK: load i16* %{{.*}}, align 1
+; CHECK: load volatile i16* %{{.*}}, align 1
 ; CHECK: load double* %{{.*}}, align 1
 ; CHECK: store volatile double %{{.*}}, double* %{{.*}}, align 1
-; CHECK: load i16* %{{.*}}, align 1
+; CHECK: load volatile i16* %{{.*}}, align 1
 ; CHECK: ret void
 
 entry:
@@ -103,7 +106,7 @@ entry:
   store volatile double 0.0, double* %ptr1, align 1
   %weird_gep1 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 7
   %weird_cast1 = bitcast i8* %weird_gep1 to i16*
-  %weird_load1 = load i16* %weird_cast1, align 1
+  %weird_load1 = load volatile i16* %weird_cast1, align 1
 
   %raw2 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 9
   %ptr2 = bitcast i8* %raw2 to double*
@@ -111,7 +114,7 @@ entry:
   store volatile double %d1, double* %ptr2, align 1
   %weird_gep2 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 16
   %weird_cast2 = bitcast i8* %weird_gep2 to i16*
-  %weird_load2 = load i16* %weird_cast2, align 1
+  %weird_load2 = load volatile i16* %weird_cast2, align 1
 
   ret void
 }
diff --git a/llvm/test/Transforms/SROA/basictest.ll b/llvm/test/Transforms/SROA/basictest.ll
index 7cedca56c48..9edb2b42dc0 100644
--- a/llvm/test/Transforms/SROA/basictest.ll
+++ b/llvm/test/Transforms/SROA/basictest.ll
@@ -572,7 +572,8 @@ bad:
 }
 
 define i8 @test12() {
-; We promote these to three SSA values which fold away immediately.
+; We fully promote these to the i24 load or store size, resulting in just masks
+; and other operations that instcombine will fold, but no alloca.
 ;
 ; CHECK-LABEL: @test12(
 
@@ -591,6 +592,17 @@ entry:
   %ai = load i24* %aiptr
 ; CHECK-NOT: store
 ; CHECK-NOT: load
+; CHECK: %[[ext2:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift2:.*]] = shl i24 %[[ext2]], 16
+; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, 65535
+; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[shift2]]
+; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8
+; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281
+; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]]
+; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], -256
+; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[ext0]]
 
   %biptr = bitcast [3 x i8]* %b to i24*
   store i24 %ai, i24* %biptr
@@ -602,12 +614,17 @@ entry:
   %b2 = load i8* %b2ptr
 ; CHECK-NOT: store
 ; CHECK-NOT: load
+; CHECK: %[[trunc0:.*]] = trunc i24 %[[insert0]] to i8
+; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8
+; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8
+; CHECK-NEXT: %[[shift2:.*]] = lshr i24 %[[insert0]], 16
+; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[shift2]] to i8
 
   %bsum0 = add i8 %b0, %b1
   %bsum1 = add i8 %bsum0, %b2
   ret i8 %bsum1
-; CHECK: %[[sum0:.*]] = add i8 0, 0
-; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], 0
+; CHECK: %[[sum0:.*]] = add i8 %[[trunc0]], %[[trunc1]]
+; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], %[[trunc2]]
 ; CHECK-NEXT: ret i8 %[[sum1]]
 }
@@ -1492,3 +1509,35 @@ entry:
   store i64 %v2, i64* bitcast ([2 x float]* @complex2 to i64*)
   ret void
 }
+
+define float @test27() {
+; Another, more complex case of splittable i64 loads and stores. This example
+; is a particularly challenging one because the load and store both point into
+; the alloca SROA is processing, and they overlap but at an offset.
+;
+; CHECK-LABEL: @test27(
+; CHECK-NOT: alloca
+; CHECK: %[[F1:.*]] = bitcast i32 0 to float
+; CHECK: %[[F2:.*]] = bitcast i32 1065353216 to float
+; CHECK: %[[SUM:.*]] = fadd float %[[F1]], %[[F2]]
+; CHECK: ret float %[[SUM]]
+
+entry:
+  %a = alloca [12 x i8]
+  %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
+  %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
+  %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
+  %iptr1 = bitcast i8* %gep1 to i64*
+  %iptr2 = bitcast i8* %gep2 to i64*
+  %fptr1 = bitcast i8* %gep1 to float*
+  %fptr2 = bitcast i8* %gep2 to float*
+  %fptr3 = bitcast i8* %gep3 to float*
+  store float 0.0, float* %fptr1
+  store float 1.0, float* %fptr2
+  %v = load i64* %iptr1
+  store i64 %v, i64* %iptr2
+  %f1 = load float* %fptr2
+  %f2 = load float* %fptr3
+  %ret = fadd float %f1, %f2
+  ret float %ret
+}
diff --git a/llvm/test/Transforms/SROA/big-endian.ll b/llvm/test/Transforms/SROA/big-endian.ll
index 1ef074844d1..9e87a9f073c 100644
--- a/llvm/test/Transforms/SROA/big-endian.ll
+++ b/llvm/test/Transforms/SROA/big-endian.ll
@@ -3,6 +3,65 @@
 
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
 
+define i8 @test1() {
+; We fully promote these to the i24 load or store size, resulting in just masks
+; and other operations that instcombine will fold, but no alloca. Note this is
+; the same as test12 in basictest.ll, but here we assert big-endian byte
+; ordering.
+;
+; CHECK-LABEL: @test1(
+
+entry:
+  %a = alloca [3 x i8]
+  %b = alloca [3 x i8]
+; CHECK-NOT: alloca
+
+  %a0ptr = getelementptr [3 x i8]* %a, i64 0, i32 0
+  store i8 0, i8* %a0ptr
+  %a1ptr = getelementptr [3 x i8]* %a, i64 0, i32 1
+  store i8 0, i8* %a1ptr
+  %a2ptr = getelementptr [3 x i8]* %a, i64 0, i32 2
+  store i8 0, i8* %a2ptr
+  %aiptr = bitcast [3 x i8]* %a to i24*
+  %ai = load i24* %aiptr
+; CHECK-NOT: store
+; CHECK-NOT: load
+; CHECK: %[[ext2:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, -256
+; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[ext2]]
+; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8
+; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281
+; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]]
+; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift0:.*]] = shl i24 %[[ext0]], 16
+; CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], 65535
+; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[shift0]]
+
+  %biptr = bitcast [3 x i8]* %b to i24*
+  store i24 %ai, i24* %biptr
+  %b0ptr = getelementptr [3 x i8]* %b, i64 0, i32 0
+  %b0 = load i8* %b0ptr
+  %b1ptr = getelementptr [3 x i8]* %b, i64 0, i32 1
+  %b1 = load i8* %b1ptr
+  %b2ptr = getelementptr [3 x i8]* %b, i64 0, i32 2
+  %b2 = load i8* %b2ptr
+; CHECK-NOT: store
+; CHECK-NOT: load
+; CHECK: %[[shift0:.*]] = lshr i24 %[[insert0]], 16
+; CHECK-NEXT: %[[trunc0:.*]] = trunc i24 %[[shift0]] to i8
+; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8
+; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8
+; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[insert0]] to i8
+
+  %bsum0 = add i8 %b0, %b1
+  %bsum1 = add i8 %bsum0, %b2
+  ret i8 %bsum1
+; CHECK: %[[sum0:.*]] = add i8 %[[trunc0]], %[[trunc1]]
+; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], %[[trunc2]]
+; CHECK-NEXT: ret i8 %[[sum1]]
+}
+
 define i64 @test2() {
 ; Test for various mixed sizes of integer loads and stores all getting
 ; promoted.
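As an illustrative sketch (not part of this commit), the IR below shows the kind of slice the relaxed rule in handleLoadOrStore now marks splittable: a non-volatile integer load that covers only part of its alloca. The function name @partial_slice_example and its constants are made up for illustration. Under the old predicate this load was never splittable because it neither starts at offset 0 nor covers the whole alloca; with this change it starts out splittable, and splitAlloca later calls makeUnsplittable() on it if presplitLoadsAndStores did not consume it.

define i16 @partial_slice_example() {
entry:
  %a = alloca [8 x i8]
  %gep = getelementptr [8 x i8]* %a, i32 0, i32 2
  %cast = bitcast i8* %gep to i16*
  store i16 7, i16* %cast
  ; This non-volatile integer load forms slice [2, 4) of the 8-byte alloca.
  ; Old rule: IsSplittable = false (Offset != 0 and Size < AllocSize).
  ; New rule: IsSplittable = true, later forced back to unsplittable in
  ; splitAlloca if pre-splitting could not handle it.
  %val = load i16* %cast
  ret i16 %val
}

Such a function could be exercised the same way as the tests above, for example via opt -sroa -S piped into FileCheck; the exact CHECK lines would depend on the output SROA actually produces for it.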

