 llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp |  9 +++++----
 llvm/test/Transforms/LoopIdiom/basic.ll           | 75 ++++++++++++++++++++++-
 2 files changed, 80 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 0451f774378..9ef5d81a66d 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -756,8 +756,8 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
   MSIs.insert(MSI);
   bool NegStride = SizeInBytes == -Stride;
   return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
-                                 MSI->getAlignment(), SplatValue, MSI, MSIs, Ev,
-                                 BECount, NegStride, /*IsLoopMemset=*/true);
+                                 MSI->getDestAlignment(), SplatValue, MSI, MSIs,
+                                 Ev, BECount, NegStride, /*IsLoopMemset=*/true);
 }
 
 /// mayLoopAccessLocation - Return true if the specified loop might access the
@@ -1037,16 +1037,17 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
   Value *NumBytes =
       Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
 
-  unsigned Align = std::min(SI->getAlignment(), LI->getAlignment());
   CallInst *NewCall = nullptr;
   // Check whether to generate an unordered atomic memcpy:
   //  If the load or store are atomic, then they must neccessarily be unordered
   //  by previous checks.
   if (!SI->isAtomic() && !LI->isAtomic())
-    NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, Align);
+    NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlignment(),
+                                   LoadBasePtr, LI->getAlignment(), NumBytes);
   else {
     // We cannot allow unaligned ops for unordered load/store, so reject
     // anything where the alignment isn't at least the element size.
+    unsigned Align = std::min(SI->getAlignment(), LI->getAlignment());
     if (Align < StoreSize)
       return false;
 
diff --git a/llvm/test/Transforms/LoopIdiom/basic.ll b/llvm/test/Transforms/LoopIdiom/basic.ll
index e16817fe364..7c491b357c7 100644
--- a/llvm/test/Transforms/LoopIdiom/basic.ll
+++ b/llvm/test/Transforms/LoopIdiom/basic.ll
@@ -28,6 +28,29 @@ for.end:                                          ; preds = %for.body, %entry
 ; CHECK-NOT: store
 }
 
+; Make sure memset is formed for larger than 1 byte stores, and that the
+; alignment of the store is preserved
+define void @test1_i16(i16* align 2 %Base, i64 %Size) nounwind ssp {
+bb.nph:                                           ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i16, i16* %Base, i64 %indvar
+  store i16 0, i16* %I.0.014, align 2
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+; CHECK-LABEL: @test1_i16(
+; CHECK: %[[BaseBC:.*]] = bitcast i16* %Base to i8*
+; CHECK: %[[Sz:[0-9]+]] = shl i64 %Size, 1
+; CHECK: call void @llvm.memset.p0i8.i64(i8* align 2 %[[BaseBC]], i8 0, i64 %[[Sz]], i1 false)
+; CHECK-NOT: store
+}
+
 ; This is a loop that was rotated but where the blocks weren't merged.  This
 ; shouldn't perturb us.
 define void @test1a(i8* %Base, i64 %Size) nounwind ssp {
@@ -169,6 +192,58 @@ for.end:                                          ; preds = %for.body, %entry
 ; CHECK: ret void
 }
 
+;; memcpy formation, check alignment
+define void @test6_dest_align(i32* noalias align 1 %Base, i32* noalias align 4 %Dest, i64 %Size) nounwind ssp {
+bb.nph:
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i32, i32* %Base, i64 %indvar
+  %DestI = getelementptr i32, i32* %Dest, i64 %indvar
+  %V = load i32, i32* %I.0.014, align 1
+  store i32 %V, i32* %DestI, align 4
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+; CHECK-LABEL: @test6_dest_align(
+; CHECK: %[[Dst:.*]] = bitcast i32* %Dest to i8*
+; CHECK: %[[Src:.*]] = bitcast i32* %Base to i8*
+; CHECK: %[[Sz:[0-9]+]] = shl i64 %Size, 2
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %[[Dst]], i8* align 1 %[[Src]], i64 %[[Sz]], i1 false)
+; CHECK-NOT: store
+; CHECK: ret void
+}
+
+;; memcpy formation, check alignment
+define void @test6_src_align(i32* noalias align 4 %Base, i32* noalias align 1 %Dest, i64 %Size) nounwind ssp {
+bb.nph:
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %I.0.014 = getelementptr i32, i32* %Base, i64 %indvar
+  %DestI = getelementptr i32, i32* %Dest, i64 %indvar
+  %V = load i32, i32* %I.0.014, align 4
+  store i32 %V, i32* %DestI, align 1
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+; CHECK-LABEL: @test6_src_align(
+; CHECK: %[[Dst:.*]] = bitcast i32* %Dest to i8*
+; CHECK: %[[Src:.*]] = bitcast i32* %Base to i8*
+; CHECK: %[[Sz:[0-9]+]] = shl i64 %Size, 2
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %[[Dst]], i8* align 4 %[[Src]], i64 %[[Sz]], i1 false)
+; CHECK-NOT: store
+; CHECK: ret void
+}
+
 ; This is a loop that was rotated but where the blocks weren't merged.  This
 ; shouldn't perturb us.
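For context, the C++ side of this patch is a migration to the IRBuilder::CreateMemCpy
overload that takes separate destination and source alignments, rather than a single
std::min() of the two. A minimal sketch of the call shape, assuming the post-patch
API exactly as exercised in the diff above (the helper name emitLoopMemCpy is
hypothetical, for illustration only):

    #include "llvm/IR/IRBuilder.h"

    using namespace llvm;

    // Hypothetical helper showing the overload this patch migrates to.
    // Pre-patch, one alignment covered both operands:
    //   B.CreateMemCpy(Dst, Src, Size, std::min(DstAlign, SrcAlign));
    // Post-patch, each pointer operand carries its own alignment, so the
    // emitted intrinsic can be e.g. "i8* align 4 %dst, i8* align 1 %src".
    static CallInst *emitLoopMemCpy(IRBuilder<> &B, Value *Dst,
                                    unsigned DstAlign, Value *Src,
                                    unsigned SrcAlign, Value *Size) {
      return B.CreateMemCpy(Dst, DstAlign, Src, SrcAlign, Size);
    }

This is what the @test6_dest_align and @test6_src_align tests verify: the looser
of the two alignments no longer drags the other down, and each operand of the
generated llvm.memcpy keeps the alignment of the original load or store.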

