author     Craig Topper <craig.topper@intel.com>  2019-08-01 18:49:07 +0000
committer  Craig Topper <craig.topper@intel.com>  2019-08-01 18:49:07 +0000
commit     a9ed5436bdf250f3d9f1ecda4f2a3541ea4b037d (patch)
tree       8af6846ba6904ff08bd1f3616496609f0a0100b9
parent     005cc423168196b2e0c7b39d5465744bad951bf3 (diff)
[X86] In decomposeMulByConstant, legalize the VT before querying whether the multiply is legal
If a type is larger than a legal type and needs to be split, we would previously allow the multiply to be decomposed even if the split multiply is legal. Since the shift + add/sub code would also need to be split, it's not any better to decompose it.

This patch figures out what type the mul will eventually be legalized to and then uses that type for the query. I tried just returning false for illegal types and letting them get handled after type legalization, but then we can't recognize an i64 constant splat on 32-bit targets, since it will be destroyed by type legalization. We could special case vectors of i64 to avoid that...

Differential Revision: https://reviews.llvm.org/D65533

llvm-svn: 367601
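As an aside, here is a minimal standalone sketch of the "legalize the type before querying" idea. This is a toy model with hypothetical names (shouldDecomposeMul, splitInHalf), not LLVM's actual TargetLowering API:

#include <cassert>

// Toy model of vector type legalization on an SSE4.1-like target:
// a wider-than-legal vector is split in half until it is legal.
enum SimpleVT { v16i32, v8i32, v4i32 };

static bool isLegal(SimpleVT VT) { return VT == v4i32; }
static SimpleVT splitInHalf(SimpleVT VT) {
  return VT == v16i32 ? v8i32 : v4i32;
}

// Mirror of the patch's logic: walk VT down to the type it will be
// legalized to, then ask whether a multiply of *that* type is cheap.
static bool shouldDecomposeMul(SimpleVT VT, bool mulLegalOnV4i32) {
  while (!isLegal(VT))
    VT = splitInHalf(VT);
  return !mulLegalOnV4i32; // keep the multiply if it is legal
}

int main() {
  // v8i32 splits into two v4i32 halves; with SSE4.1's pmulld the
  // multiply stays a multiply instead of split shl + add sequences.
  assert(!shouldDecomposeMul(v8i32, /*mulLegalOnV4i32=*/true));
  // Without a legal v4i32 multiply, decomposing still pays off.
  assert(shouldDecomposeMul(v8i32, /*mulLegalOnV4i32=*/false));
  return 0;
}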
-rw-r--r--  llvm/include/llvm/CodeGen/TargetLowering.h      3
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp   2
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp        14
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.h           3
-rw-r--r--  llvm/test/CodeGen/X86/vector-mul.ll           100
5 files changed, 41 insertions, 81 deletions
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index ebe8872e5de..3d6f4700255 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1904,7 +1904,8 @@ public:
/// This may be true if the target does not directly support the
/// multiplication operation for the specified type or the sequence of simpler
/// ops is faster than the multiply.
- virtual bool decomposeMulByConstant(EVT VT, SDValue C) const {
+ virtual bool decomposeMulByConstant(LLVMContext &Context,
+ EVT VT, SDValue C) const {
return false;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 185a9a72fa3..9d8850b59f5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3556,7 +3556,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
// x * 15 --> (x << 4) - x
// x * -33 --> -((x << 5) + x)
// x * -15 --> -((x << 4) - x) ; this reduces --> x - (x << 4)
- if (N1IsConst && TLI.decomposeMulByConstant(VT, N1)) {
+ if (N1IsConst && TLI.decomposeMulByConstant(*DAG.getContext(), VT, N1)) {
// TODO: We could handle more general decomposition of any constant by
// having the target set a limit on number of ops and making a
// callback to determine that sequence (similar to sqrt expansion).
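The strength-reduction identities in the comment above are easy to sanity check in plain C++. A throwaway test, not part of the patch, using wrapping unsigned arithmetic the way 32-bit hardware (and the DAG) behaves:

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t x = -1000; x <= 1000; ++x) {
    uint32_t u = (uint32_t)x; // wrap like a 32-bit register
    assert((uint32_t)(x * 17)  == (u << 4) + u);        // x * 17  --> (x << 4) + x
    assert((uint32_t)(x * 15)  == (u << 4) - u);        // x * 15  --> (x << 4) - x
    assert((uint32_t)(x * -33) == 0u - ((u << 5) + u)); // x * -33 --> -((x << 5) + x)
    assert((uint32_t)(x * -15) == u - (u << 4));        // x * -15 --> x - (x << 4)
  }
  return 0;
}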
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4b8987b0a02..46b31894df7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4869,15 +4869,25 @@ bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
return true;
}
-bool X86TargetLowering::decomposeMulByConstant(EVT VT, SDValue C) const {
+bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
+ SDValue C) const {
// TODO: We handle scalars using custom code, but generic combining could make
// that unnecessary.
APInt MulC;
if (!ISD::isConstantSplatVector(C.getNode(), MulC))
return false;
+ // Find the type this will be legalized to. Otherwise we might prematurely
+ // convert this to shl+add/sub and then still have to type legalize those ops.
+ // Another choice would be to defer the decision for illegal types until
+ // after type legalization. But constant splat vectors of i64 can't make it
+ // through type legalization on 32-bit targets so we would need to special
+ // case vXi64.
+ while (getTypeAction(Context, VT) != TypeLegal)
+ VT = getTypeToTransformTo(Context, VT);
+
// If vector multiply is legal, assume that's faster than shl + add/sub.
- // TODO: Multiply is a complex op with higher latency and lower througput in
+ // TODO: Multiply is a complex op with higher latency and lower throughput in
// most implementations, so this check could be loosened based on type
// and/or a CPU attribute.
if (isOperationLegal(ISD::MUL, VT))
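Why legalize the type up front instead of returning false for illegal types and retrying after type legalization? The commit message's vXi64 caveat: on a 32-bit target, an i64 constant splat does not survive legalization as a splat. A toy illustration of the effect, not LLVM code:

#include <cstdint>
#include <vector>

// Toy model: on a 32-bit target, <2 x i64> <17, 17> is legalized to
// <4 x i32> by splitting each i64 element into two i32 halves.
static std::vector<uint32_t> legalizeV2i64Splat(uint64_t splat) {
  uint32_t lo = (uint32_t)splat, hi = (uint32_t)(splat >> 32);
  return {lo, hi, lo, hi};
}

static bool isConstantSplat(const std::vector<uint32_t> &v) {
  for (uint32_t elt : v)
    if (elt != v[0])
      return false;
  return true;
}

int main() {
  // <2 x i64> <17, 17> becomes <4 x i32> <17, 0, 17, 0>: no longer a
  // splat, so a combine running after legalization could not match it.
  return isConstantSplat(legalizeV2i64Splat(17)) ? 1 : 0; // returns 0
}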
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 625b42d3515..8dc58a188dd 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1105,7 +1105,8 @@ namespace llvm {
bool convertSelectOfConstantsToMath(EVT VT) const override;
- bool decomposeMulByConstant(EVT VT, SDValue C) const override;
+ bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
+ SDValue C) const override;
bool shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
bool IsSigned) const override;
diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
index 1377d1ce920..805ff9f69ed 100644
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -435,26 +435,16 @@ define <4 x i64> @mul_v4i64_17(<4 x i64> %a0) nounwind {
define <8 x i32> @mul_v8i32_17(<8 x i32> %a0) nounwind {
; X86-LABEL: mul_v8i32_17:
; X86: # %bb.0:
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: pslld $4, %xmm2
-; X86-NEXT: paddd %xmm0, %xmm2
-; X86-NEXT: movdqa %xmm1, %xmm3
-; X86-NEXT: pslld $4, %xmm3
-; X86-NEXT: paddd %xmm1, %xmm3
-; X86-NEXT: movdqa %xmm2, %xmm0
-; X86-NEXT: movdqa %xmm3, %xmm1
+; X86-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17]
+; X86-NEXT: pmulld %xmm2, %xmm0
+; X86-NEXT: pmulld %xmm2, %xmm1
; X86-NEXT: retl
;
; X64-LABEL: mul_v8i32_17:
; X64: # %bb.0:
-; X64-NEXT: movdqa %xmm0, %xmm2
-; X64-NEXT: pslld $4, %xmm2
-; X64-NEXT: paddd %xmm0, %xmm2
-; X64-NEXT: movdqa %xmm1, %xmm3
-; X64-NEXT: pslld $4, %xmm3
-; X64-NEXT: paddd %xmm1, %xmm3
-; X64-NEXT: movdqa %xmm2, %xmm0
-; X64-NEXT: movdqa %xmm3, %xmm1
+; X64-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17]
+; X64-NEXT: pmulld %xmm2, %xmm0
+; X64-NEXT: pmulld %xmm2, %xmm1
; X64-NEXT: retq
;
; X64-XOP-LABEL: mul_v8i32_17:
@@ -484,26 +474,16 @@ define <8 x i32> @mul_v8i32_17(<8 x i32> %a0) nounwind {
define <16 x i16> @mul_v16i16_17(<16 x i16> %a0) nounwind {
; X86-LABEL: mul_v16i16_17:
; X86: # %bb.0:
-; X86-NEXT: movdqa %xmm0, %xmm2
-; X86-NEXT: psllw $4, %xmm2
-; X86-NEXT: paddw %xmm0, %xmm2
-; X86-NEXT: movdqa %xmm1, %xmm3
-; X86-NEXT: psllw $4, %xmm3
-; X86-NEXT: paddw %xmm1, %xmm3
-; X86-NEXT: movdqa %xmm2, %xmm0
-; X86-NEXT: movdqa %xmm3, %xmm1
+; X86-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17]
+; X86-NEXT: pmullw %xmm2, %xmm0
+; X86-NEXT: pmullw %xmm2, %xmm1
; X86-NEXT: retl
;
; X64-LABEL: mul_v16i16_17:
; X64: # %bb.0:
-; X64-NEXT: movdqa %xmm0, %xmm2
-; X64-NEXT: psllw $4, %xmm2
-; X64-NEXT: paddw %xmm0, %xmm2
-; X64-NEXT: movdqa %xmm1, %xmm3
-; X64-NEXT: psllw $4, %xmm3
-; X64-NEXT: paddw %xmm1, %xmm3
-; X64-NEXT: movdqa %xmm2, %xmm0
-; X64-NEXT: movdqa %xmm3, %xmm1
+; X64-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17]
+; X64-NEXT: pmullw %xmm2, %xmm0
+; X64-NEXT: pmullw %xmm2, %xmm1
; X64-NEXT: retq
;
; X64-XOP-LABEL: mul_v16i16_17:
@@ -797,32 +777,16 @@ define <4 x i64> @mul_v4i64_neg1025(<4 x i64> %a0) nounwind {
define <8 x i32> @mul_v8i32_neg33(<8 x i32> %a0) nounwind {
; X86-LABEL: mul_v8i32_neg33:
; X86: # %bb.0:
-; X86-NEXT: movdqa %xmm0, %xmm3
-; X86-NEXT: pslld $5, %xmm3
-; X86-NEXT: paddd %xmm0, %xmm3
-; X86-NEXT: pxor %xmm2, %xmm2
-; X86-NEXT: pxor %xmm0, %xmm0
-; X86-NEXT: psubd %xmm3, %xmm0
-; X86-NEXT: movdqa %xmm1, %xmm3
-; X86-NEXT: pslld $5, %xmm3
-; X86-NEXT: paddd %xmm1, %xmm3
-; X86-NEXT: psubd %xmm3, %xmm2
-; X86-NEXT: movdqa %xmm2, %xmm1
+; X86-NEXT: movdqa {{.*#+}} xmm2 = [4294967263,4294967263,4294967263,4294967263]
+; X86-NEXT: pmulld %xmm2, %xmm0
+; X86-NEXT: pmulld %xmm2, %xmm1
; X86-NEXT: retl
;
; X64-LABEL: mul_v8i32_neg33:
; X64: # %bb.0:
-; X64-NEXT: movdqa %xmm0, %xmm3
-; X64-NEXT: pslld $5, %xmm3
-; X64-NEXT: paddd %xmm0, %xmm3
-; X64-NEXT: pxor %xmm2, %xmm2
-; X64-NEXT: pxor %xmm0, %xmm0
-; X64-NEXT: psubd %xmm3, %xmm0
-; X64-NEXT: movdqa %xmm1, %xmm3
-; X64-NEXT: pslld $5, %xmm3
-; X64-NEXT: paddd %xmm1, %xmm3
-; X64-NEXT: psubd %xmm3, %xmm2
-; X64-NEXT: movdqa %xmm2, %xmm1
+; X64-NEXT: movdqa {{.*#+}} xmm2 = [4294967263,4294967263,4294967263,4294967263]
+; X64-NEXT: pmulld %xmm2, %xmm0
+; X64-NEXT: pmulld %xmm2, %xmm1
; X64-NEXT: retq
;
; X64-XOP-LABEL: mul_v8i32_neg33:
@@ -855,32 +819,16 @@ define <8 x i32> @mul_v8i32_neg33(<8 x i32> %a0) nounwind {
define <16 x i16> @mul_v16i16_neg9(<16 x i16> %a0) nounwind {
; X86-LABEL: mul_v16i16_neg9:
; X86: # %bb.0:
-; X86-NEXT: movdqa %xmm0, %xmm3
-; X86-NEXT: psllw $3, %xmm3
-; X86-NEXT: paddw %xmm0, %xmm3
-; X86-NEXT: pxor %xmm2, %xmm2
-; X86-NEXT: pxor %xmm0, %xmm0
-; X86-NEXT: psubw %xmm3, %xmm0
-; X86-NEXT: movdqa %xmm1, %xmm3
-; X86-NEXT: psllw $3, %xmm3
-; X86-NEXT: paddw %xmm1, %xmm3
-; X86-NEXT: psubw %xmm3, %xmm2
-; X86-NEXT: movdqa %xmm2, %xmm1
+; X86-NEXT: movdqa {{.*#+}} xmm2 = [65527,65527,65527,65527,65527,65527,65527,65527]
+; X86-NEXT: pmullw %xmm2, %xmm0
+; X86-NEXT: pmullw %xmm2, %xmm1
; X86-NEXT: retl
;
; X64-LABEL: mul_v16i16_neg9:
; X64: # %bb.0:
-; X64-NEXT: movdqa %xmm0, %xmm3
-; X64-NEXT: psllw $3, %xmm3
-; X64-NEXT: paddw %xmm0, %xmm3
-; X64-NEXT: pxor %xmm2, %xmm2
-; X64-NEXT: pxor %xmm0, %xmm0
-; X64-NEXT: psubw %xmm3, %xmm0
-; X64-NEXT: movdqa %xmm1, %xmm3
-; X64-NEXT: psllw $3, %xmm3
-; X64-NEXT: paddw %xmm1, %xmm3
-; X64-NEXT: psubw %xmm3, %xmm2
-; X64-NEXT: movdqa %xmm2, %xmm1
+; X64-NEXT: movdqa {{.*#+}} xmm2 = [65527,65527,65527,65527,65527,65527,65527,65527]
+; X64-NEXT: pmullw %xmm2, %xmm0
+; X64-NEXT: pmullw %xmm2, %xmm1
; X64-NEXT: retq
;
; X64-XOP-LABEL: mul_v16i16_neg9: