author    Chandler Carruth <chandlerc@gmail.com>  2012-12-17 04:07:37 +0000
committer Chandler Carruth <chandlerc@gmail.com>  2012-12-17 04:07:37 +0000
commit    ccca504f3a08a47f7852fe9a8f98e2f3c68d8b1f (patch)
tree      d4adaf06c33a882f2728ec40ba47356f1aab9a94
parent    eae65a56294eddee7296fe2f9ef64de5066c52dc (diff)
Fix the first part of PR14478: memset now works.
PR14478 highlights a serious problem in SROA that simply wasn't being exercised due to a lack of vector input code mixed with C-library function calls. Part of SROA was written carefully to handle subvector accesses via memset and memcpy, but the rewriter never grew support for this. Fixing it required refactoring the subvector access code in other parts of SROA so it could be shared, and then fixing the splat formation logic and using subvector insertion (this patch).

The PR isn't quite fixed yet, as memcpy is still broken in the same way. I'm starting on that series of patches now.

Hopefully this will be enough to bring the bullet benchmark back to life with the bb-vectorizer enabled, but that may require fixing memcpy as well.

llvm-svn: 170301
-rw-r--r--  llvm/lib/Transforms/Scalar/SROA.cpp            | 102
-rw-r--r--  llvm/test/Transforms/SROA/vector-promotion.ll  |  35
2 files changed, 103 insertions(+), 34 deletions(-)
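
As a quick orientation before the diff: the rewritten memset path builds its value in two splat steps. The memset byte is first widened to an element-sized integer (getIntegerSplat), and that element is then replicated across however many vector lanes the memset covers (the new getVectorSplat helper). The standalone C++ sketch below models those two steps outside of LLVM; the helper names are purely illustrative and do not appear in SROA.cpp. For a memset byte of 1 over i32 elements it produces 0x01010101 = 16843009, the constant that shows up in the new CHECK lines of the test.

    // Model of the two splat steps the rewriter performs for a memset that
    // covers part of a vector alloca: byte -> element-wide integer, then
    // element -> N lanes. Names are illustrative only.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Byte splat to a 32-bit element (the getIntegerSplat step for i32).
    static uint32_t splatByteToI32(uint8_t Byte) {
      // Replicate the byte into each of the four byte positions.
      return 0x01010101u * Byte;
    }

    // Element splat across NumElements lanes (the getVectorSplat step).
    static std::vector<uint32_t> splatElement(uint32_t Elt, unsigned NumElements) {
      return std::vector<uint32_t>(NumElements, Elt);
    }

    int main() {
      // memset byte 1 across two i32 lanes of a <4 x i32> alloca.
      std::vector<uint32_t> Splat = splatElement(splatByteToI32(1), 2);
      for (uint32_t Lane : Splat)
        std::printf("%u\n", (unsigned)Lane); // prints 16843009 twice
      return 0;
    }
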
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 5b11579f66b..a30602889d5 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -2678,6 +2678,24 @@ private:
return V;
}
+ /// \brief Compute a vector splat for a given element value.
+ Value *getVectorSplat(IRBuilder<> &IRB, Value *V, unsigned NumElements) {
+ assert(NumElements > 0 && "Cannot splat to an empty vector.");
+
+ // First insert it into a one-element vector so we can shuffle it. It is
+ // really silly that LLVM's IR requires this in order to form a splat.
+ Value *Undef = UndefValue::get(VectorType::get(V->getType(), 1));
+ V = IRB.CreateInsertElement(Undef, V, IRB.getInt32(0),
+ getName(".splatinsert"));
+
+ // Shuffle the value across the desired number of elements.
+ SmallVector<Constant*, 8> Mask(NumElements, IRB.getInt32(0));
+ V = IRB.CreateShuffleVector(V, Undef, ConstantVector::get(Mask),
+ getName(".splat"));
+ DEBUG(dbgs() << " splat: " << *V << "\n");
+ return V;
+ }
+
bool visitMemSetInst(MemSetInst &II) {
DEBUG(dbgs() << " original: " << II << "\n");
IRBuilder<> IRB(&II);
@@ -2706,7 +2724,8 @@ private:
(BeginOffset != NewAllocaBeginOffset ||
EndOffset != NewAllocaEndOffset ||
!AllocaTy->isSingleValueType() ||
- !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)))) {
+ !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)) ||
+ TD.getTypeSizeInBits(ScalarTy)%8 != 0)) {
Type *SizeTy = II.getLength()->getType();
Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset);
CallInst *New
@@ -2722,45 +2741,60 @@ private:
// If we can represent this as a simple value, we have to build the actual
// value to store, which requires expanding the byte present in memset to
// a sensible representation for the alloca type. This is essentially
- // splatting the byte to a sufficiently wide integer, bitcasting to the
- // desired scalar type, and splatting it across any desired vector type.
+ // splatting the byte to a sufficiently wide integer, splatting it across
+ // any desired vector width, and bitcasting to the final type.
uint64_t Size = EndOffset - BeginOffset;
Value *V = getIntegerSplat(IRB, II.getValue(), Size);
- // If this is an element-wide memset of a vectorizable alloca, insert it.
- if (VecTy && (BeginOffset > NewAllocaBeginOffset ||
- EndOffset < NewAllocaEndOffset)) {
- if (V->getType() != ScalarTy)
- V = convertValue(TD, IRB, V, ScalarTy);
- StoreInst *Store = IRB.CreateAlignedStore(
- IRB.CreateInsertElement(IRB.CreateAlignedLoad(&NewAI,
- NewAI.getAlignment(),
- getName(".load")),
- V, IRB.getInt32(getIndex(BeginOffset)),
- getName(".insert")),
- &NewAI, NewAI.getAlignment());
- (void)Store;
- DEBUG(dbgs() << " to: " << *Store << "\n");
- return true;
- }
-
- // If this is a memset on an alloca where we can widen stores, insert the
- // set integer.
- if (IntTy && (BeginOffset > NewAllocaBeginOffset ||
- EndOffset < NewAllocaEndOffset)) {
+ if (VecTy) {
+ // If this is a memset of a vectorized alloca, insert it.
+ assert(ElementTy == ScalarTy);
+
+ unsigned BeginIndex = getIndex(BeginOffset);
+ unsigned EndIndex = getIndex(EndOffset);
+ assert(EndIndex > BeginIndex && "Empty vector!");
+ unsigned NumElements = EndIndex - BeginIndex;
+ assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+
+ Value *Splat = getIntegerSplat(IRB, II.getValue(),
+ TD.getTypeSizeInBits(ElementTy)/8);
+ if (NumElements > 1)
+ Splat = getVectorSplat(IRB, Splat, NumElements);
+
+ V = insertVector(IRB, Splat, BeginIndex, EndIndex);
+ } else if (IntTy) {
+ // If this is a memset on an alloca where we can widen stores, insert the
+ // set integer.
assert(!II.isVolatile());
- Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
- getName(".oldload"));
- Old = convertValue(TD, IRB, Old, IntTy);
- assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
- uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
- V = insertInteger(TD, IRB, Old, V, Offset, getName(".insert"));
- }
- if (V->getType() != AllocaTy)
- V = convertValue(TD, IRB, V, AllocaTy);
+ V = getIntegerSplat(IRB, II.getValue(), Size);
+
+ if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
+ EndOffset != NewAllocaEndOffset)) {
+ Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+ getName(".oldload"));
+ Old = convertValue(TD, IRB, Old, IntTy);
+ assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+ uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+ V = insertInteger(TD, IRB, Old, V, Offset, getName(".insert"));
+ } else {
+ assert(V->getType() == IntTy &&
+ "Wrong type for an alloca wide integer!");
+ }
+ } else {
+ // Established these invariants above.
+ assert(BeginOffset == NewAllocaBeginOffset);
+ assert(EndOffset == NewAllocaEndOffset);
+
+ V = getIntegerSplat(IRB, II.getValue(),
+ TD.getTypeSizeInBits(ScalarTy)/8);
+
+ if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
+ V = getVectorSplat(IRB, V, AllocaVecTy->getNumElements());
+ }
- Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
+ Value *New = IRB.CreateAlignedStore(convertValue(TD, IRB, V, AllocaTy),
+ &NewAI, NewAI.getAlignment(),
II.isVolatile());
(void)New;
DEBUG(dbgs() << " to: " << *New << "\n");
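
The test changes below exercise partially overlapping memsets of a <4 x i32> alloca. Each call overwrites a contiguous lane range with the splatted element while leaving the other lanes alone, which is what the shufflevector masks in the new CHECK lines encode. As a rough model of that merge (plain C++, no LLVM dependency, names hypothetical), reproducing the <0, 16843009, 50529027, 117901063> value the test ultimately returns:

    // Model of the subvector insertion: overwrite lanes [BeginIndex, EndIndex)
    // of the old vector value with the splatted memset element, keep the rest.
    #include <array>
    #include <cstdint>
    #include <cstdio>

    static std::array<uint32_t, 4> insertLanes(std::array<uint32_t, 4> Old,
                                               unsigned BeginIndex,
                                               unsigned EndIndex, uint8_t Byte) {
      uint32_t Elt = 0x01010101u * Byte; // element-wide splat of the memset byte
      for (unsigned i = BeginIndex; i != EndIndex; ++i)
        Old[i] = Elt;
      return Old;
    }

    int main() {
      std::array<uint32_t, 4> V = {0, 0, 0, 0};
      V = insertLanes(V, 0, 2, 0); // memset(&a[0], 0, 8) -> lanes 0..1
      V = insertLanes(V, 1, 3, 1); // memset(&a[1], 1, 8) -> lanes 1..2
      V = insertLanes(V, 2, 4, 3); // memset(&a[2], 3, 8) -> lanes 2..3
      V = insertLanes(V, 3, 4, 7); // memset(&a[3], 7, 4) -> lane 3
      for (uint32_t Lane : V)
        std::printf("%u\n", (unsigned)Lane); // 0 16843009 50529027 117901063
      return 0;
    }
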
diff --git a/llvm/test/Transforms/SROA/vector-promotion.ll b/llvm/test/Transforms/SROA/vector-promotion.ll
index bb34e3f084a..f957fef6dd0 100644
--- a/llvm/test/Transforms/SROA/vector-promotion.ll
+++ b/llvm/test/Transforms/SROA/vector-promotion.ll
@@ -279,6 +279,41 @@ entry:
; CHECK-NEXT: ret <4 x i32> %[[ret]]
}
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
+
+define <4 x i32> @test_subvec_memset() {
+; CHECK: @test_subvec_memset
+entry:
+ %a = alloca <4 x i32>
+; CHECK-NOT: alloca
+
+ %a.gep0 = getelementptr <4 x i32>* %a, i32 0, i32 0
+ %a.cast0 = bitcast i32* %a.gep0 to i8*
+ call void @llvm.memset.p0i8.i32(i8* %a.cast0, i8 0, i32 8, i32 0, i1 false)
+; CHECK-NOT: store
+; CHECK: %[[insert1:.*]] = shufflevector <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>, <4 x i32> undef, <4 x i32> <i32 0, i32 1, {{.*}}>
+
+ %a.gep1 = getelementptr <4 x i32>* %a, i32 0, i32 1
+ %a.cast1 = bitcast i32* %a.gep1 to i8*
+ call void @llvm.memset.p0i8.i32(i8* %a.cast1, i8 1, i32 8, i32 0, i1 false)
+; CHECK-NEXT: %[[insert2:.*]] = shufflevector <4 x i32> <i32 undef, i32 16843009, i32 16843009, i32 undef>, <4 x i32> %[[insert1]], <4 x i32> <i32 4, i32 1, i32 2, {{.*}}>
+
+ %a.gep2 = getelementptr <4 x i32>* %a, i32 0, i32 2
+ %a.cast2 = bitcast i32* %a.gep2 to i8*
+ call void @llvm.memset.p0i8.i32(i8* %a.cast2, i8 3, i32 8, i32 0, i1 false)
+; CHECK-NEXT: %[[insert3:.*]] = shufflevector <4 x i32> <i32 undef, i32 undef, i32 50529027, i32 50529027>, <4 x i32> %[[insert2]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+
+ %a.gep3 = getelementptr <4 x i32>* %a, i32 0, i32 3
+ %a.cast3 = bitcast i32* %a.gep3 to i8*
+ call void @llvm.memset.p0i8.i32(i8* %a.cast3, i8 7, i32 4, i32 0, i1 false)
+; CHECK-NEXT: %[[insert4:.*]] = insertelement <4 x i32> %[[insert3]], i32 117901063, i32 3
+
+ %ret = load <4 x i32>* %a
+
+ ret <4 x i32> %ret
+; CHECK-NEXT: ret <4 x i32> %[[insert4]]
+}
+
define i32 @PR14212() {
; CHECK: @PR14212
; This caused a crash when "splitting" the load of the i32 in order to promote