Diffstat (limited to 'llvm')
-rw-r--r-- | llvm/lib/Transforms/Scalar/SROA.cpp | 486
-rw-r--r-- | llvm/test/Transforms/SROA/basictest.ll | 56
-rw-r--r-- | llvm/test/Transforms/SROA/big-endian.ll | 59
3 files changed, 520 insertions, 81 deletions
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 688755f0704..48f38f7f473 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -237,6 +237,24 @@ public: const_iterator end() const { return Slices.end(); } /// @} + /// \brief Erase a range of slices. + void erase(iterator Start, iterator Stop) { + Slices.erase(Start, Stop); + } + + /// \brief Insert new slices for this alloca. + /// + /// This moves the slices into the alloca's slices collection, and re-sorts + /// everything so that the usual ordering properties of the alloca's slices + /// hold. + void insert(ArrayRef<Slice> NewSlices) { + int OldSize = Slices.size(); + std::move(NewSlices.begin(), NewSlices.end(), std::back_inserter(Slices)); + auto SliceI = Slices.begin() + OldSize; + std::sort(SliceI, Slices.end()); + std::inplace_merge(Slices.begin(), SliceI, Slices.end()); + } + // Forward declare an iterator to befriend it. class partition_iterator; @@ -1022,6 +1040,7 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) void AllocaSlices::print(raw_ostream &OS, const_iterator I, StringRef Indent) const { printSlice(OS, I, Indent); + OS << "\n"; printUse(OS, I, Indent); } @@ -1029,7 +1048,7 @@ void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I, StringRef Indent) const { OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")" << " slice #" << (I - begin()) - << (I->isSplittable() ? " (splittable)" : "") << "\n"; + << (I->isSplittable() ? " (splittable)" : ""); } void AllocaSlices::printUse(raw_ostream &OS, const_iterator I, @@ -1245,6 +1264,7 @@ private: friend class PHIOrSelectSpeculator; friend class AllocaSliceRewriter; + bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS); bool rewritePartition(AllocaInst &AI, AllocaSlices &AS, AllocaSlices::Partition &P); bool splitAlloca(AllocaInst &AI, AllocaSlices &AS); @@ -1794,6 +1814,27 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, return Ptr; } +/// \brief Compute the adjusted alignment for a load or store from an offset. +static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset, + const DataLayout &DL) { + unsigned Alignment; + Type *Ty; + if (auto *LI = dyn_cast<LoadInst>(I)) { + Alignment = LI->getAlignment(); + Ty = LI->getType(); + } else if (auto *SI = dyn_cast<StoreInst>(I)) { + Alignment = SI->getAlignment(); + Ty = SI->getValueOperand()->getType(); + } else { + llvm_unreachable("Only loads and stores are allowed!"); + } + + if (!Alignment) + Alignment = DL.getABITypeAlignment(Ty); + + return MinAlign(Alignment, Offset); +} + /// \brief Test whether we can convert a value from the old to the new type. /// /// This predicate should be used to guard calls to convertValue in order to @@ -2412,6 +2453,7 @@ public: BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset; DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : "")); DEBUG(AS.printSlice(dbgs(), I, "")); + DEBUG(dbgs() << "\n"); // Compute the intersecting offset range. assert(BeginOffset < NewAllocaEndOffset); @@ -3426,6 +3468,444 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset, return SubTy; } +/// \brief Pre-split loads and stores to simplify rewriting. +/// +/// We want to break up the splittable load+store pairs as much as +/// possible. 
This is important to do as a preprocessing step, as once we +/// start rewriting the accesses to partitions of the alloca we lose the +/// necessary information to correctly split apart paired loads and stores +/// which both point into this alloca. The case to consider is something like +/// the following: +/// +/// %a = alloca [12 x i8] +/// %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0 +/// %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4 +/// %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8 +/// %iptr1 = bitcast i8* %gep1 to i64* +/// %iptr2 = bitcast i8* %gep2 to i64* +/// %fptr1 = bitcast i8* %gep1 to float* +/// %fptr2 = bitcast i8* %gep2 to float* +/// %fptr3 = bitcast i8* %gep3 to float* +/// store float 0.0, float* %fptr1 +/// store float 1.0, float* %fptr2 +/// %v = load i64* %iptr1 +/// store i64 %v, i64* %iptr2 +/// %f1 = load float* %fptr2 +/// %f2 = load float* %fptr3 +/// +/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and +/// promote everything so we recover the 2 SSA values that should have been +/// there all along. +/// +/// \returns true if any changes are made. +bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { + DEBUG(dbgs() << "Pre-splitting loads and stores\n"); + + // Track the loads and stores which are candidates for pre-splitting here, in + // the order they first appear during the partition scan. These give stable + // iteration order and a basis for tracking which loads and stores we + // actually split. + SmallVector<LoadInst *, 4> Loads; + SmallVector<StoreInst *, 4> Stores; + + // We need to accumulate the splits required of each load or store where we + // can find them via a direct lookup. This is important to cross-check loads + // and stores against each other. We also track the slice so that we can kill + // all the slices that end up split. + struct SplitOffsets { + Slice *S; + std::vector<uint64_t> Splits; + }; + SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap; + + DEBUG(dbgs() << " Searching for candidate loads and stores\n"); + for (auto &P : AS.partitions()) { + for (Slice &S : P) { + if (!S.isSplittable()) + continue; + if (S.endOffset() <= P.endOffset()) + continue; + assert(P.endOffset() > S.beginOffset() && + "Empty or backwards partition!"); + + // Determine if this is a pre-splittable slice. + Instruction *I = cast<Instruction>(S.getUse()->getUser()); + if (auto *LI = dyn_cast<LoadInst>(I)) { + assert(!LI->isVolatile() && "Cannot split volatile loads!"); + + // The load must be used exclusively to store into other pointers for + // us to be able to arbitrarily pre-split it. The stores must also be + // simple to avoid changing semantics. + auto IsLoadSimplyStored = [](LoadInst *LI) { + for (User *LU : LI->users()) { + auto *SI = dyn_cast<StoreInst>(LU); + if (!SI || !SI->isSimple()) + return false; + } + return true; + }; + if (!IsLoadSimplyStored(LI)) + continue; + + Loads.push_back(LI); + } else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser())) { + if (!SI || S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex())) + continue; + auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand()); + if (!StoredLoad || !StoredLoad->isSimple()) + continue; + assert(!SI->isVolatile() && "Cannot split volatile stores!"); + + Stores.push_back(SI); + } else { + // Other uses cannot be pre-split. + continue; + } + + // Record the initial split. 
+ DEBUG(dbgs() << " Candidate: " << *I << "\n"); + auto &Offsets = SplitOffsetsMap[I]; + assert(Offsets.Splits.empty() && + "Should not have splits the first time we see an instruction!"); + Offsets.S = &S; + Offsets.Splits.push_back(P.endOffset()); + } + + // Now scan the already split slices, and add a split for any of them which + // we're going to pre-split. + for (Slice *S : P.splitSliceTails()) { + auto SplitOffsetsMapI = + SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser())); + if (SplitOffsetsMapI == SplitOffsetsMap.end()) + continue; + auto &Offsets = SplitOffsetsMapI->second; + + assert(Offsets.S == S && "Found a mismatched slice!"); + assert(!Offsets.Splits.empty() && + "Cannot have an empty set of splits on the second partition!"); + assert(Offsets.Splits.back() == P.beginOffset() && + "Previous split does not end where this one begins!"); + + // Record each split. The last partition's end isn't needed as the size + // of the slice dictates that. + if (S->endOffset() > P.endOffset()) + Offsets.Splits.push_back(P.endOffset()); + } + } + + // We may have split loads where some of their stores are split stores. For + // such loads and stores, we can only pre-split them if their splits exactly + // match relative to their starting offset. We have to verify this prior to + // any rewriting. + SmallPtrSet<LoadInst *, 4> BadSplitLoads; + Stores.erase( + std::remove_if( + Stores.begin(), Stores.end(), + [&BadSplitLoads, &SplitOffsetsMap]( + StoreInst *SI) { + // Lookup the load we are storing in our map of split offsets. + auto *LI = cast<LoadInst>(SI->getValueOperand()); + auto LoadOffsetsI = SplitOffsetsMap.find(LI); + if (LoadOffsetsI == SplitOffsetsMap.end()) + return false; // Unrelated loads are always safe. + auto &LoadOffsets = LoadOffsetsI->second; + + // Now lookup the store's offsets. + auto &StoreOffsets = SplitOffsetsMap[SI]; + + // If the relative offsets of each split in the load and store + // match exactly, then we can split them and we don't need to + // remove them here. + if (LoadOffsets.Splits == StoreOffsets.Splits) + return false; + + DEBUG(dbgs() << " Mismatched splits for load and store:\n" + << " " << *LI << "\n" + << " " << *SI << "\n"); + + // We've found a store and load that we need to split with + // mismatched relative splits. Just give up on them and remove both + // instructions from our list of candidates. + BadSplitLoads.insert(LI); + return true; + }), + Stores.end()); + Loads.erase(std::remove_if(Loads.begin(), Loads.end(), + [&BadSplitLoads](LoadInst *LI) { + return BadSplitLoads.count(LI); + }), + Loads.end()); + + // If no loads or stores are left, there is no pre-splitting to be done for + // this alloca. + if (Loads.empty() && Stores.empty()) + return false; + + // From here on, we can't fail and will be building new accesses, so rig up + // an IR builder. + IRBuilderTy IRB(&AI); + + // Collect the new slices which we will merge into the alloca slices. + SmallVector<Slice, 4> NewSlices; + + // Track any allocas we end up splitting loads and stores for so we iterate + // on them. + SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas; + + // At this point, we have collected all of the loads and stores we can + // pre-split, and the specific splits needed for them. We actually do the + // splitting in a specific order in order to handle when one of the loads in + // the value operand to one of the stores. + // + // First, we rewrite all of the split loads, and just accumulate each split + // load in a parallel structure. 
We also build the slices for them and append + // them to the alloca slices. + SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap; + std::vector<LoadInst *> SplitLoads; + for (LoadInst *LI : Loads) { + SplitLoads.clear(); + + IntegerType *Ty = cast<IntegerType>(LI->getType()); + uint64_t LoadSize = Ty->getBitWidth() / 8; + assert(LoadSize > 0 && "Cannot have a zero-sized integer load!"); + + auto &Offsets = SplitOffsetsMap[LI]; + assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() && + "Slice size should always match load size exactly!"); + uint64_t BaseOffset = Offsets.S->beginOffset(); + assert(BaseOffset + LoadSize > BaseOffset && + "Cannot represent alloca access size using 64-bit integers!"); + + Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand()); + IRB.SetInsertPoint(BasicBlock::iterator(LI)); + + DEBUG(dbgs() << " Splitting load: " << *LI << "\n"); + + uint64_t PartOffset = 0, PartSize = Offsets.Splits.front(); + int Idx = 0, Size = Offsets.Splits.size(); + for (;;) { + auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8); + auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace()); + LoadInst *PLoad = IRB.CreateAlignedLoad( + getAdjustedPtr(IRB, *DL, BasePtr, + APInt(DL->getPointerSizeInBits(), PartOffset), PartPtrTy, + BasePtr->getName() + "."), + getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false, + LI->getName()); + + // Append this load onto the list of split loads so we can find it later + // to rewrite the stores. + SplitLoads.push_back(PLoad); + + // Now build a new slice for the alloca. + NewSlices.push_back(Slice(BaseOffset + PartOffset, + BaseOffset + PartOffset + PartSize, + &PLoad->getOperandUse(PLoad->getPointerOperandIndex()), + /*IsSplittable*/ true)); + DEBUG(AS.printSlice(dbgs(), std::prev(AS.end()), " ")); + DEBUG(dbgs() << ": " << *PLoad << "\n"); + + // Setup the next partition. + PartOffset = Offsets.Splits[Idx]; + ++Idx; + if (Idx > Size) + break; + PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset; + } + + // Now that we have the split loads, do the slow walk over all uses of the + // load and rewrite them as split stores, or save the split loads to use + // below if the store is going to be split there anyways. + bool DeferredStores = false; + for (User *LU : LI->users()) { + StoreInst *SI = cast<StoreInst>(LU); + if (!Stores.empty() && SplitOffsetsMap.count(SI)) { + DeferredStores = true; + DEBUG(dbgs() << " Deferred splitting of store: " << *SI << "\n"); + continue; + } + + Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand()); + IRB.SetInsertPoint(BasicBlock::iterator(SI)); + + DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n"); + + for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) { + LoadInst *PLoad = SplitLoads[Idx]; + uint64_t PartOffset = Idx == 0 ? 
0 : Offsets.Splits[Idx - 1]; + auto *PartPtrTy = PLoad->getType()->getPointerTo(SI->getPointerAddressSpace()); + + StoreInst *PStore = IRB.CreateAlignedStore( + PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr, + APInt(DL->getPointerSizeInBits(), PartOffset), + PartPtrTy, StoreBasePtr->getName() + "."), + getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false); + (void)PStore; + DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n"); + } + + // We want to immediately iterate on any allocas impacted by splitting + // this store, and we have to track any promotable alloca (indicated by + // a direct store) as needing to be resplit because it is no longer + // promotable. + if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) { + ResplitPromotableAllocas.insert(OtherAI); + Worklist.insert(OtherAI); + } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>( + StoreBasePtr->stripInBoundsOffsets())) { + Worklist.insert(OtherAI); + } + + // Mark the original store as dead. + DeadInsts.insert(SI); + } + + // Save the split loads if there are deferred stores among the users. + if (DeferredStores) + SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads))); + + // Mark the original load as dead and kill the original slice. + DeadInsts.insert(LI); + Offsets.S->kill(); + } + + // Second, we rewrite all of the split stores. At this point, we know that + // all loads from this alloca have been split already. For stores of such + // loads, we can simply look up the pre-existing split loads. For stores of + // other loads, we split those loads first and then write split stores of + // them. + for (StoreInst *SI : Stores) { + auto *LI = cast<LoadInst>(SI->getValueOperand()); + IntegerType *Ty = cast<IntegerType>(LI->getType()); + uint64_t StoreSize = Ty->getBitWidth() / 8; + assert(StoreSize > 0 && "Cannot have a zero-sized integer store!"); + + auto &Offsets = SplitOffsetsMap[SI]; + assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() && + "Slice size should always match load size exactly!"); + uint64_t BaseOffset = Offsets.S->beginOffset(); + assert(BaseOffset + StoreSize > BaseOffset && + "Cannot represent alloca access size using 64-bit integers!"); + + Instruction *LoadBasePtr = cast<Instruction>(LI->getPointerOperand()); + Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand()); + + DEBUG(dbgs() << " Splitting store: " << *SI << "\n"); + + // Check whether we have an already split load. + auto SplitLoadsMapI = SplitLoadsMap.find(LI); + std::vector<LoadInst *> *SplitLoads = nullptr; + if (SplitLoadsMapI != SplitLoadsMap.end()) { + SplitLoads = &SplitLoadsMapI->second; + assert(SplitLoads->size() == Offsets.Splits.size() + 1 && + "Too few split loads for the number of splits in the store!"); + } else { + DEBUG(dbgs() << " of load: " << *LI << "\n"); + } + + + uint64_t PartOffset = 0, PartSize = Offsets.Splits.front(); + int Idx = 0, Size = Offsets.Splits.size(); + for (;;) { + auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8); + auto *PartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace()); + + // Either lookup a split load or create one. 
+ LoadInst *PLoad; + if (SplitLoads) { + PLoad = (*SplitLoads)[Idx]; + } else { + IRB.SetInsertPoint(BasicBlock::iterator(LI)); + PLoad = IRB.CreateAlignedLoad( + getAdjustedPtr(IRB, *DL, LoadBasePtr, + APInt(DL->getPointerSizeInBits(), PartOffset), + PartPtrTy, LoadBasePtr->getName() + "."), + getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false, + LI->getName()); + } + + // And store this partition. + IRB.SetInsertPoint(BasicBlock::iterator(SI)); + StoreInst *PStore = IRB.CreateAlignedStore( + PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr, + APInt(DL->getPointerSizeInBits(), PartOffset), + PartPtrTy, StoreBasePtr->getName() + "."), + getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false); + + // Now build a new slice for the alloca. + NewSlices.push_back( + Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize, + &PStore->getOperandUse(PStore->getPointerOperandIndex()), + /*IsSplittable*/ true)); + DEBUG(AS.printSlice(dbgs(), std::prev(AS.end()), " ")); + DEBUG(dbgs() << ": " << *PStore << "\n"); + if (!SplitLoads) { + DEBUG(dbgs() << " of split load: " << *PLoad << "\n"); + } + + // Setup the next partition. + PartOffset = Offsets.Splits[Idx]; + ++Idx; + if (Idx > Size) + break; + PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset; + } + + // We want to immediately iterate on any allocas impacted by splitting + // this load, which is only relevant if it isn't a load of this alloca and + // thus we didn't already split the loads above. We also have to keep track + // of any promotable allocas we split loads on as they can no longer be + // promoted. + if (!SplitLoads) { + if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) { + assert(OtherAI != &AI && "We can't re-split our own alloca!"); + ResplitPromotableAllocas.insert(OtherAI); + Worklist.insert(OtherAI); + } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>( + LoadBasePtr->stripInBoundsOffsets())) { + assert(OtherAI != &AI && "We can't re-split our own alloca!"); + Worklist.insert(OtherAI); + } + } + + // Mark the original store as dead now that we've split it up and kill its + // slice. Note that we leave the original load in place. It may in turn be + // split up if it is an alloca load for some other alloca, but it may be + // a normal load. This may introduce redundant loads, but where those can + // be merged the rest of the optimizer should handle the merging, and this + // uncovers SSA splits which is more important. In practice, the original + // loads will almost always be fully split and removed eventually, and the + // splits will be merged by any trivial CSE, including instcombine. + DeadInsts.insert(SI); + Offsets.S->kill(); + } + + // Now we need to remove the killed slices, sort the newly added slices, and + // merge the two sorted ranges of slices so that the entire range is sorted + // properly for us to re-compute the partitions. + AS.erase(std::remove_if(AS.begin(), AS.end(), [](const Slice &S) { + return S.isDead(); + }), AS.end()); + + AS.insert(NewSlices); + + DEBUG(dbgs() << " Pre-split slices:\n"); +#ifndef NDEBUG + for (auto I = AS.begin(), E = AS.end(); I != E; ++I) + DEBUG(AS.print(dbgs(), I, " ")); +#endif + + // Finally, don't try to promote any allocas that new require re-splitting. + // They have already been added to the worklist above. 
+ PromotableAllocas.erase( + std::remove_if( + PromotableAllocas.begin(), PromotableAllocas.end(), + [&](AllocaInst *AI) { return ResplitPromotableAllocas.count(AI); }), + PromotableAllocas.end()); + + return true; +} + /// \brief Rewrite an alloca partition's users. /// /// This routine drives both of the rewriting goals of the SROA pass. It tries @@ -3582,7 +4062,9 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { unsigned NumPartitions = 0; bool Changed = false; - // Rewrite each parttion. + Changed |= presplitLoadsAndStores(AI, AS); + + // Rewrite each partition. for (auto &P : AS.partitions()) { Changed |= rewritePartition(AI, AS, P); ++NumPartitions; diff --git a/llvm/test/Transforms/SROA/basictest.ll b/llvm/test/Transforms/SROA/basictest.ll index dc2b16550a0..914095dc383 100644 --- a/llvm/test/Transforms/SROA/basictest.ll +++ b/llvm/test/Transforms/SROA/basictest.ll @@ -572,8 +572,7 @@ bad: } define i8 @test12() { -; We fully promote these to the i24 load or store size, resulting in just masks -; and other operations that instcombine will fold, but no alloca. +; We promote these to three SSA values which fold away immediately. ; ; CHECK-LABEL: @test12( @@ -592,17 +591,6 @@ entry: %ai = load i24* %aiptr ; CHECK-NOT: store ; CHECK-NOT: load -; CHECK: %[[ext2:.*]] = zext i8 0 to i24 -; CHECK-NEXT: %[[shift2:.*]] = shl i24 %[[ext2]], 16 -; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, 65535 -; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[shift2]] -; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24 -; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8 -; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281 -; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]] -; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24 -; CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], -256 -; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[ext0]] %biptr = bitcast [3 x i8]* %b to i24* store i24 %ai, i24* %biptr @@ -614,17 +602,12 @@ entry: %b2 = load i8* %b2ptr ; CHECK-NOT: store ; CHECK-NOT: load -; CHECK: %[[trunc0:.*]] = trunc i24 %[[insert0]] to i8 -; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8 -; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8 -; CHECK-NEXT: %[[shift2:.*]] = lshr i24 %[[insert0]], 16 -; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[shift2]] to i8 %bsum0 = add i8 %b0, %b1 %bsum1 = add i8 %bsum0, %b2 ret i8 %bsum1 -; CHECK: %[[sum0:.*]] = add i8 %[[trunc0]], %[[trunc1]] -; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], %[[trunc2]] +; CHECK: %[[sum0:.*]] = add i8 0, 0 +; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], 0 ; CHECK-NEXT: ret i8 %[[sum1]] } @@ -1440,3 +1423,36 @@ entry: ret void } +define float @test25() { +; Check that we split up stores in order to promote the smaller SSA values.. These types +; of patterns can arise because LLVM maps small memcpy's to integer load and +; stores. If we get a memcpy of an aggregate (such as C and C++ frontends would +; produce, but so might any language frontend), this will in many cases turn into +; an integer load and store. SROA needs to be extremely powerful to correctly +; handle these cases and form splitable and promotable SSA values. 
+; +; CHECK-LABEL: @test25( +; CHECK-NOT: alloca +; CHECK: %[[F1:.*]] = bitcast i32 0 to float +; CHECK: %[[F2:.*]] = bitcast i32 1065353216 to float +; CHECK: %[[SUM:.*]] = fadd float %[[F1]], %[[F2]] +; CHECK: ret float %[[SUM]] + +entry: + %a = alloca i64 + %b = alloca i64 + %a.cast = bitcast i64* %a to [2 x float]* + %a.gep1 = getelementptr [2 x float]* %a.cast, i32 0, i32 0 + %a.gep2 = getelementptr [2 x float]* %a.cast, i32 0, i32 1 + %b.cast = bitcast i64* %b to [2 x float]* + %b.gep1 = getelementptr [2 x float]* %b.cast, i32 0, i32 0 + %b.gep2 = getelementptr [2 x float]* %b.cast, i32 0, i32 1 + store float 0.0, float* %a.gep1 + store float 1.0, float* %a.gep2 + %v = load i64* %a + store i64 %v, i64* %b + %f1 = load float* %b.gep1 + %f2 = load float* %b.gep2 + %ret = fadd float %f1, %f2 + ret float %ret +} diff --git a/llvm/test/Transforms/SROA/big-endian.ll b/llvm/test/Transforms/SROA/big-endian.ll index 9e87a9f073c..1ef074844d1 100644 --- a/llvm/test/Transforms/SROA/big-endian.ll +++ b/llvm/test/Transforms/SROA/big-endian.ll @@ -3,65 +3,6 @@ target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64" -define i8 @test1() { -; We fully promote these to the i24 load or store size, resulting in just masks -; and other operations that instcombine will fold, but no alloca. Note this is -; the same as test12 in basictest.ll, but here we assert big-endian byte -; ordering. -; -; CHECK-LABEL: @test1( - -entry: - %a = alloca [3 x i8] - %b = alloca [3 x i8] -; CHECK-NOT: alloca - - %a0ptr = getelementptr [3 x i8]* %a, i64 0, i32 0 - store i8 0, i8* %a0ptr - %a1ptr = getelementptr [3 x i8]* %a, i64 0, i32 1 - store i8 0, i8* %a1ptr - %a2ptr = getelementptr [3 x i8]* %a, i64 0, i32 2 - store i8 0, i8* %a2ptr - %aiptr = bitcast [3 x i8]* %a to i24* - %ai = load i24* %aiptr -; CHECK-NOT: store -; CHECK-NOT: load -; CHECK: %[[ext2:.*]] = zext i8 0 to i24 -; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, -256 -; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[ext2]] -; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24 -; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8 -; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281 -; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]] -; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24 -; CHECK-NEXT: %[[shift0:.*]] = shl i24 %[[ext0]], 16 -; CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], 65535 -; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[shift0]] - - %biptr = bitcast [3 x i8]* %b to i24* - store i24 %ai, i24* %biptr - %b0ptr = getelementptr [3 x i8]* %b, i64 0, i32 0 - %b0 = load i8* %b0ptr - %b1ptr = getelementptr [3 x i8]* %b, i64 0, i32 1 - %b1 = load i8* %b1ptr - %b2ptr = getelementptr [3 x i8]* %b, i64 0, i32 2 - %b2 = load i8* %b2ptr -; CHECK-NOT: store -; CHECK-NOT: load -; CHECK: %[[shift0:.*]] = lshr i24 %[[insert0]], 16 -; CHECK-NEXT: %[[trunc0:.*]] = trunc i24 %[[shift0]] to i8 -; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8 -; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8 -; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[insert0]] to i8 - - %bsum0 = add i8 %b0, %b1 - %bsum1 = add i8 %bsum0, %b2 - ret i8 %bsum1 -; CHECK: %[[sum0:.*]] = add i8 %[[trunc0]], %[[trunc1]] -; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], %[[trunc2]] -; CHECK-NEXT: ret i8 %[[sum1]] -} - define i64 @test2() { ; Test for various mixed sizes of integer loads and stores all getting ; promoted. |