 llvm/include/llvm/CodeGen/TargetLowering.h      |   3
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp   |   2
 llvm/lib/Target/X86/X86ISelLowering.cpp         |  14
 llvm/lib/Target/X86/X86ISelLowering.h           |   3
 llvm/test/CodeGen/X86/vector-mul.ll             | 100
 5 files changed, 41 insertions(+), 81 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index ebe8872e5de..3d6f4700255 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1904,7 +1904,8 @@ public:
   /// This may be true if the target does not directly support the
   /// multiplication operation for the specified type or the sequence of simpler
   /// ops is faster than the multiply.
-  virtual bool decomposeMulByConstant(EVT VT, SDValue C) const {
+  virtual bool decomposeMulByConstant(LLVMContext &Context,
+                                      EVT VT, SDValue C) const {
     return false;
   }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 185a9a72fa3..9d8850b59f5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3556,7 +3556,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
   // x * 15 --> (x << 4) - x
   // x * -33 --> -((x << 5) + x)
   // x * -15 --> -((x << 4) - x) ; this reduces --> x - (x << 4)
-  if (N1IsConst && TLI.decomposeMulByConstant(VT, N1)) {
+  if (N1IsConst && TLI.decomposeMulByConstant(*DAG.getContext(), VT, N1)) {
     // TODO: We could handle more general decomposition of any constant by
     //       having the target set a limit on number of ops and making a
     //       callback to determine that sequence (similar to sqrt expansion).
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4b8987b0a02..46b31894df7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4869,15 +4869,25 @@ bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
   return true;
 }
 
-bool X86TargetLowering::decomposeMulByConstant(EVT VT, SDValue C) const {
+bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
+                                               SDValue C) const {
   // TODO: We handle scalars using custom code, but generic combining could make
   // that unnecessary.
   APInt MulC;
   if (!ISD::isConstantSplatVector(C.getNode(), MulC))
     return false;
 
+  // Find the type this will be legalized to. Otherwise we might prematurely
+  // convert this to shl+add/sub and then still have to type legalize those ops.
+  // Another choice would be to defer the decision for illegal types until
+  // after type legalization. But constant splat vectors of i64 can't make it
+  // through type legalization on 32-bit targets so we would need to special
+  // case vXi64.
+  while (getTypeAction(Context, VT) != TypeLegal)
+    VT = getTypeToTransformTo(Context, VT);
+
   // If vector multiply is legal, assume that's faster than shl + add/sub.
-  // TODO: Multiply is a complex op with higher latency and lower througput in
+  // TODO: Multiply is a complex op with higher latency and lower throughput in
   // most implementations, so this check could be loosened based on type
   // and/or a CPU attribute.
   if (isOperationLegal(ISD::MUL, VT))
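The new comment block above carries the key reasoning of the patch: decide whether to decompose a constant multiply based on the type it will be legalized to, not the type it has before legalization. The following is a minimal standalone C++ sketch of that decision, illustrative only and not part of the patch. The enum, tables, and function names are invented stand-ins for getTypeAction, getTypeToTransformTo, and isOperationLegal, under the assumption of an SSE4.1-level subtarget where v8i32 is split into two legal v4i32 halves and PMULLD makes the v4i32 multiply legal.

    #include <cassert>
    #include <string>

    // Toy stand-ins (not the LLVM API) for the type-legalization queries the
    // patch consults: type action, the type a value is transformed to, and
    // whether MUL is legal on a given type.
    enum class Action { Legal, Split };

    // Assumption: an SSE4.1-like target where v8i32 is not a legal register
    // type (it gets split) and v4i32 is legal with a legal PMULLD multiply.
    Action getAction(const std::string &VT) {
      return VT == "v8i32" ? Action::Split : Action::Legal;
    }
    std::string typeToTransformTo(const std::string &VT) {
      return VT == "v8i32" ? "v4i32" : VT; // splitting halves the vector
    }
    bool isMulLegal(const std::string &VT) { return VT == "v4i32"; }

    // Mirrors the shape of the new X86 hook: walk to the type the operation
    // will actually be legalized to, then decide using that type.
    bool decomposeMulByConstant(std::string VT) {
      while (getAction(VT) != Action::Legal)
        VT = typeToTransformTo(VT);
      // If a vector multiply is legal on the legalized type, keep the multiply.
      return !isMulLegal(VT);
    }

    int main() {
      // With the legalized type consulted, v8i32 answers "keep the multiply",
      // which is why the test output below switches from shl+add to pmulld.
      assert(!decomposeMulByConstant("v8i32"));
      assert(!decomposeMulByConstant("v4i32"));
      return 0;
    }

On a target where even the legalized type has no legal vector multiply, the hook would still return true and the DAGCombiner shl+add/sub expansion quoted earlier would fire as before.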
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 625b42d3515..8dc58a188dd 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1105,7 +1105,8 @@ namespace llvm {
 
     bool convertSelectOfConstantsToMath(EVT VT) const override;
 
-    bool decomposeMulByConstant(EVT VT, SDValue C) const override;
+    bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
+                                SDValue C) const override;
 
     bool shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
                                   bool IsSigned) const override;
diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
index 1377d1ce920..805ff9f69ed 100644
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -435,26 +435,16 @@ define <4 x i64> @mul_v4i64_17(<4 x i64> %a0) nounwind {
 define <8 x i32> @mul_v8i32_17(<8 x i32> %a0) nounwind {
 ; X86-LABEL: mul_v8i32_17:
 ; X86:       # %bb.0:
-; X86-NEXT:    movdqa %xmm0, %xmm2
-; X86-NEXT:    pslld $4, %xmm2
-; X86-NEXT:    paddd %xmm0, %xmm2
-; X86-NEXT:    movdqa %xmm1, %xmm3
-; X86-NEXT:    pslld $4, %xmm3
-; X86-NEXT:    paddd %xmm1, %xmm3
-; X86-NEXT:    movdqa %xmm2, %xmm0
-; X86-NEXT:    movdqa %xmm3, %xmm1
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [17,17,17,17]
+; X86-NEXT:    pmulld %xmm2, %xmm0
+; X86-NEXT:    pmulld %xmm2, %xmm1
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mul_v8i32_17:
 ; X64:       # %bb.0:
-; X64-NEXT:    movdqa %xmm0, %xmm2
-; X64-NEXT:    pslld $4, %xmm2
-; X64-NEXT:    paddd %xmm0, %xmm2
-; X64-NEXT:    movdqa %xmm1, %xmm3
-; X64-NEXT:    pslld $4, %xmm3
-; X64-NEXT:    paddd %xmm1, %xmm3
-; X64-NEXT:    movdqa %xmm2, %xmm0
-; X64-NEXT:    movdqa %xmm3, %xmm1
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [17,17,17,17]
+; X64-NEXT:    pmulld %xmm2, %xmm0
+; X64-NEXT:    pmulld %xmm2, %xmm1
 ; X64-NEXT:    retq
 ;
 ; X64-XOP-LABEL: mul_v8i32_17:
@@ -484,26 +474,16 @@ define <8 x i32> @mul_v8i32_17(<8 x i32> %a0) nounwind {
 define <16 x i16> @mul_v16i16_17(<16 x i16> %a0) nounwind {
 ; X86-LABEL: mul_v16i16_17:
 ; X86:       # %bb.0:
-; X86-NEXT:    movdqa %xmm0, %xmm2
-; X86-NEXT:    psllw $4, %xmm2
-; X86-NEXT:    paddw %xmm0, %xmm2
-; X86-NEXT:    movdqa %xmm1, %xmm3
-; X86-NEXT:    psllw $4, %xmm3
-; X86-NEXT:    paddw %xmm1, %xmm3
-; X86-NEXT:    movdqa %xmm2, %xmm0
-; X86-NEXT:    movdqa %xmm3, %xmm1
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17]
+; X86-NEXT:    pmullw %xmm2, %xmm0
+; X86-NEXT:    pmullw %xmm2, %xmm1
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mul_v16i16_17:
 ; X64:       # %bb.0:
-; X64-NEXT:    movdqa %xmm0, %xmm2
-; X64-NEXT:    psllw $4, %xmm2
-; X64-NEXT:    paddw %xmm0, %xmm2
-; X64-NEXT:    movdqa %xmm1, %xmm3
-; X64-NEXT:    psllw $4, %xmm3
-; X64-NEXT:    paddw %xmm1, %xmm3
-; X64-NEXT:    movdqa %xmm2, %xmm0
-; X64-NEXT:    movdqa %xmm3, %xmm1
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17]
+; X64-NEXT:    pmullw %xmm2, %xmm0
+; X64-NEXT:    pmullw %xmm2, %xmm1
 ; X64-NEXT:    retq
 ;
 ; X64-XOP-LABEL: mul_v16i16_17:
@@ -797,32 +777,16 @@ define <4 x i64> @mul_v4i64_neg1025(<4 x i64> %a0) nounwind {
 define <8 x i32> @mul_v8i32_neg33(<8 x i32> %a0) nounwind {
 ; X86-LABEL: mul_v8i32_neg33:
 ; X86:       # %bb.0:
-; X86-NEXT:    movdqa %xmm0, %xmm3
-; X86-NEXT:    pslld $5, %xmm3
-; X86-NEXT:    paddd %xmm0, %xmm3
-; X86-NEXT:    pxor %xmm2, %xmm2
-; X86-NEXT:    pxor %xmm0, %xmm0
-; X86-NEXT:    psubd %xmm3, %xmm0
-; X86-NEXT:    movdqa %xmm1, %xmm3
-; X86-NEXT:    pslld $5, %xmm3
-; X86-NEXT:    paddd %xmm1, %xmm3
-; X86-NEXT:    psubd %xmm3, %xmm2
-; X86-NEXT:    movdqa %xmm2, %xmm1
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [4294967263,4294967263,4294967263,4294967263]
+; X86-NEXT:    pmulld %xmm2, %xmm0
+; X86-NEXT:    pmulld %xmm2, %xmm1
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mul_v8i32_neg33:
 ; X64:       # %bb.0:
-; X64-NEXT:    movdqa %xmm0, %xmm3
-; X64-NEXT:    pslld $5, %xmm3
-; X64-NEXT:    paddd %xmm0, %xmm3
-; X64-NEXT:    pxor %xmm2, %xmm2
-; X64-NEXT:    pxor %xmm0, %xmm0
-; X64-NEXT:    psubd %xmm3, %xmm0
-; X64-NEXT:    movdqa %xmm1, %xmm3
-; X64-NEXT:    pslld $5, %xmm3
-; X64-NEXT:    paddd %xmm1, %xmm3
-; X64-NEXT:    psubd %xmm3, %xmm2
-; X64-NEXT:    movdqa %xmm2, %xmm1
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [4294967263,4294967263,4294967263,4294967263]
+; X64-NEXT:    pmulld %xmm2, %xmm0
+; X64-NEXT:    pmulld %xmm2, %xmm1
 ; X64-NEXT:    retq
 ;
 ; X64-XOP-LABEL: mul_v8i32_neg33:
@@ -855,32 +819,16 @@ define <8 x i32> @mul_v8i32_neg33(<8 x i32> %a0) nounwind {
 define <16 x i16> @mul_v16i16_neg9(<16 x i16> %a0) nounwind {
 ; X86-LABEL: mul_v16i16_neg9:
 ; X86:       # %bb.0:
-; X86-NEXT:    movdqa %xmm0, %xmm3
-; X86-NEXT:    psllw $3, %xmm3
-; X86-NEXT:    paddw %xmm0, %xmm3
-; X86-NEXT:    pxor %xmm2, %xmm2
-; X86-NEXT:    pxor %xmm0, %xmm0
-; X86-NEXT:    psubw %xmm3, %xmm0
-; X86-NEXT:    movdqa %xmm1, %xmm3
-; X86-NEXT:    psllw $3, %xmm3
-; X86-NEXT:    paddw %xmm1, %xmm3
-; X86-NEXT:    psubw %xmm3, %xmm2
-; X86-NEXT:    movdqa %xmm2, %xmm1
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [65527,65527,65527,65527,65527,65527,65527,65527]
+; X86-NEXT:    pmullw %xmm2, %xmm0
+; X86-NEXT:    pmullw %xmm2, %xmm1
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mul_v16i16_neg9:
 ; X64:       # %bb.0:
-; X64-NEXT:    movdqa %xmm0, %xmm3
-; X64-NEXT:    psllw $3, %xmm3
-; X64-NEXT:    paddw %xmm0, %xmm3
-; X64-NEXT:    pxor %xmm2, %xmm2
-; X64-NEXT:    pxor %xmm0, %xmm0
-; X64-NEXT:    psubw %xmm3, %xmm0
-; X64-NEXT:    movdqa %xmm1, %xmm3
-; X64-NEXT:    psllw $3, %xmm3
-; X64-NEXT:    paddw %xmm1, %xmm3
-; X64-NEXT:    psubw %xmm3, %xmm2
-; X64-NEXT:    movdqa %xmm2, %xmm1
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [65527,65527,65527,65527,65527,65527,65527,65527]
+; X64-NEXT:    pmullw %xmm2, %xmm0
+; X64-NEXT:    pmullw %xmm2, %xmm1
 ; X64-NEXT:    retq
 ;
 ; X64-XOP-LABEL: mul_v16i16_neg9:
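The deleted CHECK lines in these test hunks were the per-128-bit-half shl+add/sub expansions named in the DAGCombiner comment: x * 17 as (x << 4) + x, x * -33 as -((x << 5) + x), and x * -9 as -((x << 3) + x). The new output instead keeps one splat-constant pmulld/pmullw per half, with -33 and -9 printed as the unsigned values 4294967263 and 65527. A small standalone check of those scalar identities, written here purely for illustration and not part of the patch:

    #include <cassert>
    #include <cstdint>

    // Scalar versions of the shl+add/sub sequences the old CHECK lines encoded.
    // Unsigned types keep the shifts and wrap-around well-defined, matching the
    // modular arithmetic the vector instructions perform.
    uint32_t mul17(uint32_t x)    { return (x << 4) + x; }        // x * 17
    uint32_t mulNeg33(uint32_t x) { return 0u - ((x << 5) + x); } // x * -33
    uint16_t mulNeg9(uint16_t x) {                                // x * -9
      return static_cast<uint16_t>(0u - ((static_cast<uint32_t>(x) << 3) + x));
    }

    int main() {
      for (uint32_t x : {0u, 1u, 7u, 12345u, 0xFFFFFFFFu}) {
        assert(mul17(x) == x * 17u);
        assert(mulNeg33(x) == x * 4294967263u);            // -33 as unsigned i32
        uint16_t h = static_cast<uint16_t>(x);
        assert(mulNeg9(h) == static_cast<uint16_t>(h * 65527u)); // -9 as unsigned i16
      }
      return 0;
    }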

