| field | value | date |
|---|---|---|
| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-01-24 19:20:02 +0000 |
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-01-24 19:20:02 +0000 |
| commit | 9f551ad60423cedacdc502e98729fb8ad1a4ce9b (patch) | |
| tree | ac3f8e8e8e8c8dcbbf6cf6a9fb6143b31a762301 /llvm/lib | |
| parent | a9263c8963a94334fe9951147485b1e77892d0d1 (diff) | |
| download | bcm5719-llvm-9f551ad60423cedacdc502e98729fb8ad1a4ce9b.tar.gz, bcm5719-llvm-9f551ad60423cedacdc502e98729fb8ad1a4ce9b.zip | |
[X86][SSE] Aggressively use PMADDWD for v4i32 multiplies with 17 or more leading zeros
As discussed in D41484, PMADDWD for 'zero-extended' vXi32 is nearly always a better option than PMULLD:
On SNB it results in code that is no faster, but also no slower, so we may as well use it.
On KNL PMADDWD has only half the throughput, so I've disabled it there - ideally there'd be a better way than this.
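The bound is 17 leading zeros rather than 16 because PMADDWD multiplies the i16 lanes as signed values: with the top 17 bits of every 32-bit element clear, each element is below 2^15, so its high i16 half is zero and its low half is non-negative, and the pairwise multiply-add reduces to lo(a)*lo(b) + 0*0, i.e. the exact 32-bit product. A minimal standalone sketch of that equivalence with SSE intrinsics (my illustration, not part of the patch; SSE4.1 is needed only for the PMULLD reference):

```cpp
// Hypothetical demo, not from the commit: with the upper 17 bits of every
// 32-bit lane clear, PMADDWD computes the same v4i32 product as PMULLD.
// Build with e.g. -msse4.1 so _mm_mullo_epi32 is available.
#include <emmintrin.h>  // SSE2: _mm_madd_epi16, _mm_setr_epi32
#include <smmintrin.h>  // SSE4.1: _mm_mullo_epi32 (reference only)
#include <cassert>
#include <cstdint>

int main() {
  // Every lane is < 2^15, i.e. the top 17 bits are zero.
  __m128i a = _mm_setr_epi32(3, 1000, 32767, 12345);
  __m128i b = _mm_setr_epi32(7, 2000, 32767, 4);

  // PMADDWD on the same bits viewed as v8i16: each i32 result lane is
  // lo*lo + hi*hi, and the hi halves are zero by assumption, so this is
  // the full 32-bit product of the low halves.
  __m128i madd = _mm_madd_epi16(a, b);

  // PMULLD reference result.
  __m128i mull = _mm_mullo_epi32(a, b);

  alignas(16) int32_t x[4], y[4];
  _mm_store_si128(reinterpret_cast<__m128i *>(x), madd);
  _mm_store_si128(reinterpret_cast<__m128i *>(y), mull);
  for (int i = 0; i < 4; ++i)
    assert(x[i] == y[i]);  // identical products in every lane
  return 0;
}
```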
Differential Revision: https://reviews.llvm.org/D42258
llvm-svn: 323367
Diffstat (limited to 'llvm/lib')
| mode | path | lines |
|---|---|---|
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 28 |

1 file changed, 20 insertions, 8 deletions
```diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 50edb59eb8f..b520f0ac905 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -22326,7 +22326,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
   assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
          "Should not custom lower when pmulld is available!");
 
-  // If the upper 17 bits of each element are zero then we can use PMADD.
+  // If the upper 17 bits of each element are zero then we can use PMADDWD.
   APInt Mask17 = APInt::getHighBitsSet(32, 17);
   if (DAG.MaskedValueIsZero(A, Mask17) && DAG.MaskedValueIsZero(B, Mask17))
     return DAG.getNode(X86ISD::VPMADDWD, dl, VT,
@@ -32707,13 +32707,6 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
   if ((NumElts % 2) != 0)
     return SDValue();
 
-  // If the upper 17 bits of each element are zero then we can use PMADD.
-  APInt Mask17 = APInt::getHighBitsSet(32, 17);
-  if (VT == MVT::v4i32 && DAG.MaskedValueIsZero(N0, Mask17) &&
-      DAG.MaskedValueIsZero(N1, Mask17))
-    return DAG.getNode(X86ISD::VPMADDWD, DL, VT, DAG.getBitcast(MVT::v8i16, N0),
-                       DAG.getBitcast(MVT::v8i16, N1));
-
   unsigned RegSize = 128;
   MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
   EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
@@ -32885,6 +32878,25 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
   EVT VT = N->getValueType(0);
+
+  // If the upper 17 bits of each element are zero then we can use PMADDWD,
+  // which is always at least as quick as PMULLD, except on KNL.
+  if (Subtarget.getProcFamily() != X86Subtarget::IntelKNL &&
+      ((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
+       (VT == MVT::v8i32 && Subtarget.hasAVX2()) ||
+       (VT == MVT::v16i32 && Subtarget.hasBWI()))) {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    APInt Mask17 = APInt::getHighBitsSet(32, 17);
+    if (DAG.MaskedValueIsZero(N0, Mask17) &&
+        DAG.MaskedValueIsZero(N1, Mask17)) {
+      unsigned NumElts = VT.getVectorNumElements();
+      MVT WVT = MVT::getVectorVT(MVT::i16, 2 * NumElts);
+      return DAG.getNode(X86ISD::VPMADDWD, SDLoc(N), VT,
+                         DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1));
+    }
+  }
+
   if (DCI.isBeforeLegalize() && VT.isVector())
     return reduceVMULWidth(N, DAG, Subtarget);
```
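For context, here is a hedged sketch of the kind of source pattern the moved combineMul check can now catch (my illustration, not from the commit): masking both operands to 15 bits lets known-bits analysis (DAG.MaskedValueIsZero) prove the top 17 bits of every vector element are zero, so a vectorized v4i32 multiply of these operands qualifies for VPMADDWD instead of PMULLD.

```cpp
// Hypothetical input, not from the commit: the 0x7FFF masks guarantee both
// multiply operands are < 2^15, i.e. the upper 17 bits of each 32-bit
// element are known zero once the loop is vectorized to v4i32 multiplies.
#include <cstdint>

void mul15(int32_t *out, const int32_t *a, const int32_t *b, int n) {
  for (int i = 0; i < n; ++i)
    out[i] = (a[i] & 0x7FFF) * (b[i] & 0x7FFF);
}
```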

