summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--llvm/include/llvm/Analysis/ValueTracking.h3
-rw-r--r--llvm/lib/Analysis/ValueTracking.cpp94
-rw-r--r--llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp7
-rw-r--r--llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp5
-rw-r--r--llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll33
-rw-r--r--llvm/test/Transforms/MemCpyOpt/memcpy-to-memset.ll98
6 files changed, 185 insertions, 55 deletions
diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index 57cf37fa2c6..a92ba8e374f 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -221,7 +221,8 @@ class Value;
/// return the i8 value that it is represented with. This is true for all i8
/// values obviously, but is also true for i32 0, i32 -1, i16 0xF0F0, double
/// 0.0 etc. If the value can't be handled with a repeated byte store (e.g.
- /// i16 0x1234), return null.
+ /// i16 0x1234), return null. If the value is entirely undef and padding,
+ /// return undef.
Value *isBytewiseValue(Value *V);
/// Given an aggregate and a sequence of indices, see if the scalar value
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 122fa72d773..495324ff534 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -3042,62 +3042,92 @@ bool llvm::isKnownNeverNaN(const Value *V, const TargetLibraryInfo *TLI,
return true;
}
-/// If the specified value can be set by repeating the same byte in memory,
-/// return the i8 value that it is represented with. This is
-/// true for all i8 values obviously, but is also true for i32 0, i32 -1,
-/// i16 0xF0F0, double 0.0 etc. If the value can't be handled with a repeated
-/// byte store (e.g. i16 0x1234), return null.
Value *llvm::isBytewiseValue(Value *V) {
+
// All byte-wide stores are splatable, even of arbitrary variables.
- if (V->getType()->isIntegerTy(8)) return V;
+ if (V->getType()->isIntegerTy(8))
+ return V;
+
+ LLVMContext &Ctx = V->getContext();
+
+ // Undef bytes don't care; they can be merged with any other byte value.
+ auto *UndefInt8 = UndefValue::get(Type::getInt8Ty(Ctx));
+ if (isa<UndefValue>(V))
+ return UndefInt8;
+
+ Constant *C = dyn_cast<Constant>(V);
+ if (!C) {
+ // Conceptually, we could handle things like:
+ // %a = zext i8 %X to i16
+ // %b = shl i16 %a, 8
+ // %c = or i16 %a, %b
+ // but until there is an example that actually needs this, it doesn't seem
+ // worth worrying about.
+ return nullptr;
+ }
// Handle 'null' ConstantArrayZero etc.
- if (Constant *C = dyn_cast<Constant>(V))
- if (C->isNullValue())
- return Constant::getNullValue(Type::getInt8Ty(V->getContext()));
+ if (C->isNullValue())
+ return Constant::getNullValue(Type::getInt8Ty(Ctx));
- // Constant float and double values can be handled as integer values if the
+ // Constant floating-point values can be handled as integer values if the
// corresponding integer value is "byteable". An important case is 0.0.
- if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
- if (CFP->getType()->isFloatTy())
- V = ConstantExpr::getBitCast(CFP, Type::getInt32Ty(V->getContext()));
- if (CFP->getType()->isDoubleTy())
- V = ConstantExpr::getBitCast(CFP, Type::getInt64Ty(V->getContext()));
+ if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
+ Type *Ty = nullptr;
+ if (CFP->getType()->isHalfTy())
+ Ty = Type::getInt16Ty(Ctx);
+ else if (CFP->getType()->isFloatTy())
+ Ty = Type::getInt32Ty(Ctx);
+ else if (CFP->getType()->isDoubleTy())
+ Ty = Type::getInt64Ty(Ctx);
// Don't handle long double formats, which have strange constraints.
+ return Ty ? isBytewiseValue(ConstantExpr::getBitCast(CFP, Ty)) : nullptr;
}
// We can handle constant integers that are multiple of 8 bits.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) {
if (CI->getBitWidth() % 8 == 0) {
assert(CI->getBitWidth() > 8 && "8 bits should be handled above!");
-
if (!CI->getValue().isSplat(8))
return nullptr;
- return ConstantInt::get(V->getContext(), CI->getValue().trunc(8));
+ return ConstantInt::get(Ctx, CI->getValue().trunc(8));
}
}
- // A ConstantDataArray/Vector is splatable if all its members are equal and
- // also splatable.
- if (ConstantDataSequential *CA = dyn_cast<ConstantDataSequential>(V)) {
- Value *Elt = CA->getElementAsConstant(0);
- Value *Val = isBytewiseValue(Elt);
- if (!Val)
+ auto Merge = [&](Value *LHS, Value *RHS) -> Value * {
+ if (LHS == RHS)
+ return LHS;
+ if (!LHS || !RHS)
return nullptr;
+ if (LHS == UndefInt8)
+ return RHS;
+ if (RHS == UndefInt8)
+ return LHS;
+ return nullptr;
+ };
- for (unsigned I = 1, E = CA->getNumElements(); I != E; ++I)
- if (CA->getElementAsConstant(I) != Elt)
+ if (ConstantDataSequential *CA = dyn_cast<ConstantDataSequential>(C)) {
+ Value *Val = UndefInt8;
+ for (unsigned I = 0, E = CA->getNumElements(); I != E; ++I)
+ if (!(Val = Merge(Val, isBytewiseValue(CA->getElementAsConstant(I)))))
return nullptr;
+ return Val;
+ }
+
+ if (isa<ConstantVector>(C)) {
+ Constant *Splat = cast<ConstantVector>(C)->getSplatValue();
+ return Splat ? isBytewiseValue(Splat) : nullptr;
+ }
+ if (isa<ConstantArray>(C) || isa<ConstantStruct>(C)) {
+ Value *Val = UndefInt8;
+ for (unsigned I = 0, E = C->getNumOperands(); I != E; ++I)
+ if (!(Val = Merge(Val, isBytewiseValue(C->getOperand(I)))))
+ return nullptr;
return Val;
}
- // Conceptually, we could handle things like:
- // %a = zext i8 %X to i16
- // %b = shl i16 %a, 8
- // %c = or i16 %a, %b
- // but until there is an example that actually needs this, it doesn't seem
- // worth worrying about.
+ // Don't try to handle the handful of other constants.
return nullptr;
}
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 6d73091592e..68abf9719a9 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -348,6 +348,9 @@ static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
/// just replicate their input array and then pass on to memset_pattern16.
static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
+ // FIXME: This could check for UndefValue because it can be merged into any
+ // other valid pattern.
+
// If the value isn't a constant, we can't promote it to being in a constant
// array. We could theoretically do a store to an alloca or something, but
// that doesn't seem worthwhile.
@@ -645,9 +648,13 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
if (For == ForMemset::Yes) {
+ if (isa<UndefValue>(FirstSplatValue))
+ FirstSplatValue = SecondSplatValue;
if (FirstSplatValue != SecondSplatValue)
continue;
} else {
+ if (isa<UndefValue>(FirstPatternValue))
+ FirstPatternValue = SecondPatternValue;
if (FirstPatternValue != SecondPatternValue)
continue;
}
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 6e858d61fef..4e82e2bd42c 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -413,7 +413,10 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
if (!NextStore->isSimple()) break;
// Check to see if this stored value is of the same byte-splattable value.
- if (ByteVal != isBytewiseValue(NextStore->getOperand(0)))
+ Value *StoredByte = isBytewiseValue(NextStore->getOperand(0));
+ if (isa<UndefValue>(ByteVal) && StoredByte)
+ ByteVal = StoredByte;
+ if (ByteVal != StoredByte)
break;
// Check to see if this store is to a constant offset from the start ptr.
diff --git a/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll b/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll
index 0215431ac35..6ce1aee338d 100644
--- a/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll
+++ b/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll
@@ -73,13 +73,16 @@ define void @copyalias(%S* %src, %S* %dst) {
ret void
}
-; If the store address is computed ina complex manner, make
+; If the store address is computed in a complex manner, make
; sure we lift the computation as well if needed and possible.
define void @addrproducer(%S* %src, %S* %dst) {
-; CHECK-LABEL: addrproducer
-; CHECK: %dst2 = getelementptr %S, %S* %dst, i64 1
-; CHECK: call void @llvm.memmove.p0i8.p0i8.i64
-; CHECK-NEXT: store %S undef, %S* %dst
+; CHECK-LABEL: addrproducer(
+; CHECK-NEXT: %[[DSTCAST:[0-9]+]] = bitcast %S* %dst to i8*
+; CHECK-NEXT: %dst2 = getelementptr %S, %S* %dst, i64 1
+; CHECK-NEXT: %[[DST2CAST:[0-9]+]] = bitcast %S* %dst2 to i8*
+; CHECK-NEXT: %[[SRCCAST:[0-9]+]] = bitcast %S* %src to i8*
+; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64(i8* align 8 %[[DST2CAST]], i8* align 8 %[[SRCCAST]], i64 16, i1 false)
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 %[[DSTCAST]], i8 undef, i64 16, i1 false)
; CHECK-NEXT: ret void
%1 = load %S, %S* %src
store %S undef, %S* %dst
@@ -89,7 +92,14 @@ define void @addrproducer(%S* %src, %S* %dst) {
}
define void @aliasaddrproducer(%S* %src, %S* %dst, i32* %dstidptr) {
-; CHECK-LABEL: aliasaddrproducer
+; CHECK-LABEL: aliasaddrproducer(
+; CHECK-NEXT: %[[SRC:[0-9]+]] = load %S, %S* %src
+; CHECK-NEXT: %[[DSTCAST:[0-9]+]] = bitcast %S* %dst to i8*
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 %[[DSTCAST]], i8 undef, i64 16, i1 false)
+; CHECK-NEXT: %dstindex = load i32, i32* %dstidptr
+; CHECK-NEXT: %dst2 = getelementptr %S, %S* %dst, i32 %dstindex
+; CHECK-NEXT: store %S %[[SRC]], %S* %dst2
+; CHECK-NEXT: ret void
%1 = load %S, %S* %src
store %S undef, %S* %dst
%dstindex = load i32, i32* %dstidptr
@@ -99,7 +109,16 @@ define void @aliasaddrproducer(%S* %src, %S* %dst, i32* %dstidptr) {
}
define void @noaliasaddrproducer(%S* %src, %S* noalias %dst, i32* noalias %dstidptr) {
-; CHECK-LABEL: noaliasaddrproducer
+; CHECK-LABEL: noaliasaddrproducer(
+; CHECK-NEXT: %[[SRCCAST:[0-9]+]] = bitcast %S* %src to i8*
+; CHECK-NEXT: %[[LOADED:[0-9]+]] = load i32, i32* %dstidptr
+; CHECK-NEXT: %dstindex = or i32 %[[LOADED]], 1
+; CHECK-NEXT: %dst2 = getelementptr %S, %S* %dst, i32 %dstindex
+; CHECK-NEXT: %[[DST2CAST:[0-9]+]] = bitcast %S* %dst2 to i8*
+; CHECK-NEXT: %[[SRCCAST2:[0-9]+]] = bitcast %S* %src to i8*
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %[[DST2CAST]], i8* align 8 %[[SRCCAST2]], i64 16, i1 false)
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 %[[SRCCAST]], i8 undef, i64 16, i1 false)
+; CHECK-NEXT: ret void
%1 = load %S, %S* %src
store %S undef, %S* %src
%2 = load i32, i32* %dstidptr
diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-to-memset.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-to-memset.ll
index 652d1c1d65b..1424ca3709c 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy-to-memset.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy-to-memset.ll
@@ -1,19 +1,89 @@
; RUN: opt -memcpyopt -S < %s | FileCheck %s
-@cst = internal constant [3 x i32] [i32 -1, i32 -1, i32 -1], align 4
-
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
-declare void @foo(i32*) nounwind
-
-define void @test1() nounwind {
- %arr = alloca [3 x i32], align 4
- %arr_i8 = bitcast [3 x i32]* %arr to i8*
- call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %arr_i8, i8* align 4 bitcast ([3 x i32]* @cst to i8*), i64 12, i1 false)
- %arraydecay = getelementptr inbounds [3 x i32], [3 x i32]* %arr, i64 0, i64 0
- call void @foo(i32* %arraydecay) nounwind
+
+@undef = internal constant i32 undef, align 4
+define void @test_undef() nounwind {
+ %a = alloca i32, align 4
+ %i8 = bitcast i32* %a to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast (i32* @undef to i8*), i64 4, i1 false)
+ ret void
+; CHECK-LABEL: @test_undef(
+; CHECK: call void @llvm.memset
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK: ret void
+}
+
+@i32x3 = internal constant [3 x i32] [i32 -1, i32 -1, i32 -1], align 4
+define void @test_i32x3() nounwind {
+ %a = alloca [3 x i32], align 4
+ %i8 = bitcast [3 x i32]* %a to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast ([3 x i32]* @i32x3 to i8*), i64 12, i1 false)
+ ret void
+; CHECK-LABEL: @test_i32x3(
+; CHECK: call void @llvm.memset
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK: ret void
+}
+
+@i32x3_undef = internal constant [3 x i32] [i32 -1, i32 undef, i32 -1], align 4
+define void @test_i32x3_undef() nounwind {
+ %a = alloca [3 x i32], align 4
+ %i8 = bitcast [3 x i32]* %a to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast ([3 x i32]* @i32x3_undef to i8*), i64 12, i1 false)
+ ret void
+; CHECK-LABEL: @test_i32x3_undef(
+; CHECK: call void @llvm.memset
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK: ret void
+}
+
+%struct.bitfield = type { i8, [3 x i8] }
+@bitfield = private unnamed_addr constant %struct.bitfield { i8 -86, [3 x i8] [i8 -86, i8 -86, i8 -86] }, align 4
+define void @test_bitfield() nounwind {
+ %a = alloca %struct.bitfield, align 4
+ %i8 = bitcast %struct.bitfield* %a to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast (%struct.bitfield* @bitfield to i8*), i64 4, i1 false)
+ ret void
+; CHECK-LABEL: @test_bitfield(
+; CHECK: call void @llvm.memset
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK: ret void
+}
+
+@i1x16_zero = internal constant <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, align 4
+define void @test_i1x16_zero() nounwind {
+ %a = alloca <16 x i1>, align 4
+ %i8 = bitcast <16 x i1>* %a to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast (<16 x i1>* @i1x16_zero to i8*), i64 16, i1 false)
+ ret void
+; CHECK-LABEL: @test_i1x16_zero(
+; CHECK: call void @llvm.memset
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK: ret void
+}
+
+; i1 isn't currently handled. Should it be?
+@i1x16_one = internal constant <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, align 4
+define void @test_i1x16_one() nounwind {
+ %a = alloca <16 x i1>, align 4
+ %i8 = bitcast <16 x i1>* %a to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast (<16 x i1>* @i1x16_one to i8*), i64 16, i1 false)
+ ret void
+; CHECK-LABEL: @test_i1x16_one(
+; CHECK-NOT: call void @llvm.memset
+; CHECK: call void @llvm.memcpy
+; CHECK: ret void
+}
+
+@half = internal constant half 0xH0000, align 4
+define void @test_half() nounwind {
+ %a = alloca half, align 4
+ %i8 = bitcast half* %a to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %i8, i8* align 4 bitcast (half* @half to i8*), i64 2, i1 false)
ret void
-; CHECK-LABEL: @test1(
-; CHECK: call void @llvm.memset
-; CHECK-NOT: call void @llvm.memcpy
-; CHECK: ret void
+; CHECK-LABEL: @test_half(
+; CHECK: call void @llvm.memset
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK: ret void
}
OpenPOWER on IntegriCloud