| author | Craig Topper <craig.topper@intel.com> | 2019-08-01 18:49:07 +0000 |
|---|---|---|
| committer | Craig Topper <craig.topper@intel.com> | 2019-08-01 18:49:07 +0000 |
| commit | a9ed5436bdf250f3d9f1ecda4f2a3541ea4b037d | |
| tree | 8af6846ba6904ff08bd1f3616496609f0a0100b9 | |
| parent | 005cc423168196b2e0c7b39d5465744bad951bf3 | |
[X86] In decomposeMulByConstant, legalize the VT before querying whether the multiply is legal
If a type is larger than a legal type and needs to be split, we would previously allow the multiply to be decomposed even if the split multiply is legal. Since the shift + add/sub code would also need to be split, it's not any better to decompose it.
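To see the trade-off in scalar form, here is a standalone sketch (plain C++, not LLVM code) of the identities the combine uses; for a vector type that must be split, each of these shifts and adds would have to be split as well, so the decomposed form saves nothing over the split multiplies:

```cpp
#include <cassert>
#include <cstdint>

// x * (2^N + 1) --> (x << N) + x and x * (2^N - 1) --> (x << N) - x;
// the DAG combine applies the same identities lane-wise to constant splats.
uint32_t mul17(uint32_t x) { return (x << 4) + x; }
uint32_t mul15(uint32_t x) { return (x << 4) - x; }
uint32_t mulNeg33(uint32_t x) { return 0u - ((x << 5) + x); }

int main() {
  for (uint32_t x : {0u, 1u, 7u, 123456u}) {
    assert(mul17(x) == x * 17u);
    assert(mul15(x) == x * 15u);
    assert(mulNeg33(x) == x * uint32_t(-33));
  }
  return 0;
}
```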
This patch figures out what type the mul will eventually be legalized to and then uses that type for the legality query. I tried just returning false for illegal types and letting them get handled after type legalization, but then we can't recognize an i64 constant splat on 32-bit targets, since it will be destroyed by type legalization. We could special case vectors of i64 to avoid that...
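Why deferring the decision loses the splat: on a 32-bit target the i64 element type is illegal, so type legalization rewrites an i64 splat constant into 32-bit lanes, after which it is no longer a splat. A standalone illustration (plain C++ mimicking the lane split, not LLVM code):

```cpp
#include <array>
#include <cstdint>
#include <iostream>

int main() {
  // A v2i64 splat of 17, before type legalization.
  std::array<uint64_t, 2> v2i64 = {17, 17};

  // A 32-bit target rewrites each i64 lane as low/high i32 halves.
  std::array<uint32_t, 4> v4i32;
  for (size_t i = 0; i < 2; ++i) {
    v4i32[2 * i]     = static_cast<uint32_t>(v2i64[i]);        // low half
    v4i32[2 * i + 1] = static_cast<uint32_t>(v2i64[i] >> 32);  // high half
  }

  // The result <17, 0, 17, 0> is not a splat, so a combine that runs after
  // type legalization can no longer recognize "multiply by 17".
  bool splat = true;
  for (uint32_t lane : v4i32)
    splat &= (lane == v4i32[0]);
  std::cout << "still a splat? " << (splat ? "yes" : "no") << "\n";  // no
}
```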
Differential Revision: https://reviews.llvm.org/D65533
llvm-svn: 367601
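Note that this changes the decomposeMulByConstant hook signature, so out-of-tree targets that override it must add the new LLVMContext parameter. A hypothetical updated override might look like the sketch below (MyTargetLowering is illustrative and not from the patch; getTypeAction, getTypeToTransformTo, and isOperationLegal are the existing TargetLoweringBase helpers used by the X86 change further down):

```cpp
// Hypothetical out-of-tree override picking up the new parameter: legalize
// VT first, then decompose only when the legalized multiply is not native.
bool MyTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
                                              SDValue C) const {
  // Walk the type down to what the type legalizer will produce, mirroring
  // the X86 implementation in this commit.
  while (getTypeAction(Context, VT) != TypeLegal)
    VT = getTypeToTransformTo(Context, VT);
  // Only decompose when the legalized multiply would be expanded anyway.
  return !isOperationLegal(ISD::MUL, VT);
}
```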
| -rw-r--r-- | llvm/include/llvm/CodeGen/TargetLowering.h | 3 |
|---|---|---|
| -rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 |
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 14 |
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.h | 3 |
| -rw-r--r-- | llvm/test/CodeGen/X86/vector-mul.ll | 100 |

5 files changed, 41 insertions(+), 81 deletions(-)
```diff
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index ebe8872e5de..3d6f4700255 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1904,7 +1904,8 @@ public:
   /// This may be true if the target does not directly support the
   /// multiplication operation for the specified type or the sequence of simpler
   /// ops is faster than the multiply.
-  virtual bool decomposeMulByConstant(EVT VT, SDValue C) const {
+  virtual bool decomposeMulByConstant(LLVMContext &Context,
+                                      EVT VT, SDValue C) const {
     return false;
   }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 185a9a72fa3..9d8850b59f5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3556,7 +3556,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
   // x * 15 --> (x << 4) - x
   // x * -33 --> -((x << 5) + x)
   // x * -15 --> -((x << 4) - x) ; this reduces --> x - (x << 4)
-  if (N1IsConst && TLI.decomposeMulByConstant(VT, N1)) {
+  if (N1IsConst && TLI.decomposeMulByConstant(*DAG.getContext(), VT, N1)) {
     // TODO: We could handle more general decomposition of any constant by
     //       having the target set a limit on number of ops and making a
     //       callback to determine that sequence (similar to sqrt expansion).
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4b8987b0a02..46b31894df7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4869,15 +4869,25 @@ bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
   return true;
 }
 
-bool X86TargetLowering::decomposeMulByConstant(EVT VT, SDValue C) const {
+bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
+                                               SDValue C) const {
   // TODO: We handle scalars using custom code, but generic combining could make
   //       that unnecessary.
   APInt MulC;
   if (!ISD::isConstantSplatVector(C.getNode(), MulC))
     return false;
 
+  // Find the type this will be legalized to. Otherwise we might prematurely
+  // convert this to shl+add/sub and then still have to type legalize those ops.
+  // Another choice would be to defer the decision for illegal types until
+  // after type legalization. But constant splat vectors of i64 can't make it
+  // through type legalization on 32-bit targets so we would need to special
+  // case vXi64.
+  while (getTypeAction(Context, VT) != TypeLegal)
+    VT = getTypeToTransformTo(Context, VT);
+
   // If vector multiply is legal, assume that's faster than shl + add/sub.
-  // TODO: Multiply is a complex op with higher latency and lower througput in
+  // TODO: Multiply is a complex op with higher latency and lower throughput in
   //       most implementations, so this check could be loosened based on type
   //       and/or a CPU attribute.
   if (isOperationLegal(ISD::MUL, VT))
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 625b42d3515..8dc58a188dd 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1105,7 +1105,8 @@ namespace llvm {
 
     bool convertSelectOfConstantsToMath(EVT VT) const override;
 
-    bool decomposeMulByConstant(EVT VT, SDValue C) const override;
+    bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
+                                SDValue C) const override;
 
     bool shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
                                   bool IsSigned) const override;
diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
index 1377d1ce920..805ff9f69ed 100644
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -435,26 +435,16 @@ define <4 x i64> @mul_v4i64_17(<4 x i64> %a0) nounwind {
 define <8 x i32> @mul_v8i32_17(<8 x i32> %a0) nounwind {
 ; X86-LABEL: mul_v8i32_17:
 ; X86:       # %bb.0:
-; X86-NEXT:    movdqa %xmm0, %xmm2
-; X86-NEXT:    pslld $4, %xmm2
-; X86-NEXT:    paddd %xmm0, %xmm2
-; X86-NEXT:    movdqa %xmm1, %xmm3
-; X86-NEXT:    pslld $4, %xmm3
-; X86-NEXT:    paddd %xmm1, %xmm3
-; X86-NEXT:    movdqa %xmm2, %xmm0
-; X86-NEXT:    movdqa %xmm3, %xmm1
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [17,17,17,17]
+; X86-NEXT:    pmulld %xmm2, %xmm0
+; X86-NEXT:    pmulld %xmm2, %xmm1
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mul_v8i32_17:
 ; X64:       # %bb.0:
-; X64-NEXT:    movdqa %xmm0, %xmm2
-; X64-NEXT:    pslld $4, %xmm2
-; X64-NEXT:    paddd %xmm0, %xmm2
-; X64-NEXT:    movdqa %xmm1, %xmm3
-; X64-NEXT:    pslld $4, %xmm3
-; X64-NEXT:    paddd %xmm1, %xmm3
-; X64-NEXT:    movdqa %xmm2, %xmm0
-; X64-NEXT:    movdqa %xmm3, %xmm1
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [17,17,17,17]
+; X64-NEXT:    pmulld %xmm2, %xmm0
+; X64-NEXT:    pmulld %xmm2, %xmm1
 ; X64-NEXT:    retq
 ;
 ; X64-XOP-LABEL: mul_v8i32_17:
@@ -484,26 +474,16 @@ define <8 x i32> @mul_v8i32_17(<8 x i32> %a0) nounwind {
 define <16 x i16> @mul_v16i16_17(<16 x i16> %a0) nounwind {
 ; X86-LABEL: mul_v16i16_17:
 ; X86:       # %bb.0:
-; X86-NEXT:    movdqa %xmm0, %xmm2
-; X86-NEXT:    psllw $4, %xmm2
-; X86-NEXT:    paddw %xmm0, %xmm2
-; X86-NEXT:    movdqa %xmm1, %xmm3
-; X86-NEXT:    psllw $4, %xmm3
-; X86-NEXT:    paddw %xmm1, %xmm3
-; X86-NEXT:    movdqa %xmm2, %xmm0
-; X86-NEXT:    movdqa %xmm3, %xmm1
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17]
+; X86-NEXT:    pmullw %xmm2, %xmm0
+; X86-NEXT:    pmullw %xmm2, %xmm1
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mul_v16i16_17:
 ; X64:       # %bb.0:
-; X64-NEXT:    movdqa %xmm0, %xmm2
-; X64-NEXT:    psllw $4, %xmm2
-; X64-NEXT:    paddw %xmm0, %xmm2
-; X64-NEXT:    movdqa %xmm1, %xmm3
-; X64-NEXT:    psllw $4, %xmm3
-; X64-NEXT:    paddw %xmm1, %xmm3
-; X64-NEXT:    movdqa %xmm2, %xmm0
-; X64-NEXT:    movdqa %xmm3, %xmm1
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17]
+; X64-NEXT:    pmullw %xmm2, %xmm0
+; X64-NEXT:    pmullw %xmm2, %xmm1
 ; X64-NEXT:    retq
 ;
 ; X64-XOP-LABEL: mul_v16i16_17:
@@ -797,32 +777,16 @@ define <4 x i64> @mul_v4i64_neg1025(<4 x i64> %a0) nounwind {
 define <8 x i32> @mul_v8i32_neg33(<8 x i32> %a0) nounwind {
 ; X86-LABEL: mul_v8i32_neg33:
 ; X86:       # %bb.0:
-; X86-NEXT:    movdqa %xmm0, %xmm3
-; X86-NEXT:    pslld $5, %xmm3
-; X86-NEXT:    paddd %xmm0, %xmm3
-; X86-NEXT:    pxor %xmm2, %xmm2
-; X86-NEXT:    pxor %xmm0, %xmm0
-; X86-NEXT:    psubd %xmm3, %xmm0
-; X86-NEXT:    movdqa %xmm1, %xmm3
-; X86-NEXT:    pslld $5, %xmm3
-; X86-NEXT:    paddd %xmm1, %xmm3
-; X86-NEXT:    psubd %xmm3, %xmm2
-; X86-NEXT:    movdqa %xmm2, %xmm1
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [4294967263,4294967263,4294967263,4294967263]
+; X86-NEXT:    pmulld %xmm2, %xmm0
+; X86-NEXT:    pmulld %xmm2, %xmm1
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mul_v8i32_neg33:
 ; X64:       # %bb.0:
-; X64-NEXT:    movdqa %xmm0, %xmm3
-; X64-NEXT:    pslld $5, %xmm3
-; X64-NEXT:    paddd %xmm0, %xmm3
-; X64-NEXT:    pxor %xmm2, %xmm2
-; X64-NEXT:    pxor %xmm0, %xmm0
-; X64-NEXT:    psubd %xmm3, %xmm0
-; X64-NEXT:    movdqa %xmm1, %xmm3
-; X64-NEXT:    pslld $5, %xmm3
-; X64-NEXT:    paddd %xmm1, %xmm3
-; X64-NEXT:    psubd %xmm3, %xmm2
-; X64-NEXT:    movdqa %xmm2, %xmm1
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [4294967263,4294967263,4294967263,4294967263]
+; X64-NEXT:    pmulld %xmm2, %xmm0
+; X64-NEXT:    pmulld %xmm2, %xmm1
 ; X64-NEXT:    retq
 ;
 ; X64-XOP-LABEL: mul_v8i32_neg33:
@@ -855,32 +819,16 @@ define <8 x i32> @mul_v8i32_neg33(<8 x i32> %a0) nounwind {
 define <16 x i16> @mul_v16i16_neg9(<16 x i16> %a0) nounwind {
 ; X86-LABEL: mul_v16i16_neg9:
 ; X86:       # %bb.0:
-; X86-NEXT:    movdqa %xmm0, %xmm3
-; X86-NEXT:    psllw $3, %xmm3
-; X86-NEXT:    paddw %xmm0, %xmm3
-; X86-NEXT:    pxor %xmm2, %xmm2
-; X86-NEXT:    pxor %xmm0, %xmm0
-; X86-NEXT:    psubw %xmm3, %xmm0
-; X86-NEXT:    movdqa %xmm1, %xmm3
-; X86-NEXT:    psllw $3, %xmm3
-; X86-NEXT:    paddw %xmm1, %xmm3
-; X86-NEXT:    psubw %xmm3, %xmm2
-; X86-NEXT:    movdqa %xmm2, %xmm1
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [65527,65527,65527,65527,65527,65527,65527,65527]
+; X86-NEXT:    pmullw %xmm2, %xmm0
+; X86-NEXT:    pmullw %xmm2, %xmm1
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mul_v16i16_neg9:
 ; X64:       # %bb.0:
-; X64-NEXT:    movdqa %xmm0, %xmm3
-; X64-NEXT:    psllw $3, %xmm3
-; X64-NEXT:    paddw %xmm0, %xmm3
-; X64-NEXT:    pxor %xmm2, %xmm2
-; X64-NEXT:    pxor %xmm0, %xmm0
-; X64-NEXT:    psubw %xmm3, %xmm0
-; X64-NEXT:    movdqa %xmm1, %xmm3
-; X64-NEXT:    psllw $3, %xmm3
-; X64-NEXT:    paddw %xmm1, %xmm3
-; X64-NEXT:    psubw %xmm3, %xmm2
-; X64-NEXT:    movdqa %xmm2, %xmm1
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [65527,65527,65527,65527,65527,65527,65527,65527]
+; X64-NEXT:    pmullw %xmm2, %xmm0
+; X64-NEXT:    pmullw %xmm2, %xmm1
 ; X64-NEXT:    retq
 ;
 ; X64-XOP-LABEL: mul_v16i16_neg9:
```

