| author | Craig Topper <craig.topper@intel.com> | 2018-12-01 19:26:31 +0000 |
|---|---|---|
| committer | Craig Topper <craig.topper@intel.com> | 2018-12-01 19:26:31 +0000 |
| commit | f4b13927e7700b5dde7984e4b470bfcfafad5e59 (patch) | |
| tree | d1ad6b4cc15787361ae62e3bde72dd69e428b4a3 /llvm/lib | |
| parent | 8bffb634970ce0ffc1f979d8399f1d415f60a688 (diff) | |
[X86] Don't use zero_extend_vector_inreg for mulhu lowering with sse 4.1
Summary: With SSE4.1 we use two zero_extend_vector_inreg nodes and a pshufd to expand the v16i8 input into two v8i16 vectors for the multiply. That's three shuffles to extend one operand; the other operand is usually a constant, since this path is mostly reached from the divide-by-constant optimization. Pre-SSE4.1 we use a punpckhbw and a punpcklbw with a zero vector. That's two shuffles plus an xor, and a copy due to tied-register constraints. That seems at least as good as the three shuffles, maybe better. With AVX we avoid the copy, so the unpack sequence is clearly better there.
Reviewers: spatel, RKSimon
Reviewed By: RKSimon
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D55138
llvm-svn: 348079
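
For readers less familiar with the instruction sequences the summary weighs against each other, here is an intrinsics sketch (illustrative only, not part of the patch; the function names are hypothetical) of the two ways to widen one v16i8 operand to two v8i16 halves: the SSE4.1 pmovzxbw-based form (three shuffles per operand) versus the unpack-against-zero form (two shuffles plus an xor, and a register copy pre-AVX because of tied operands).

```cpp
#include <immintrin.h>

// SSE4.1 style: pmovzxbw on the low eight bytes, plus a pshufd to move the
// high eight bytes down before a second pmovzxbw -- three shuffles per operand.
static inline void widen_u8_sse41(__m128i v, __m128i *lo, __m128i *hi) {
  *lo = _mm_cvtepu8_epi16(v);                                   // pmovzxbw
  __m128i vhi = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 3, 2));  // pshufd
  *hi = _mm_cvtepu8_epi16(vhi);                                 // pmovzxbw
}

// Pre-SSE4.1 style: unpack against a zero vector -- two shuffles plus an xor
// (and, before AVX, a register copy due to tied operands).
static inline void widen_u8_unpack(__m128i v, __m128i *lo, __m128i *hi) {
  __m128i zero = _mm_setzero_si128();   // pxor
  *lo = _mm_unpacklo_epi8(v, zero);     // punpcklbw
  *hi = _mm_unpackhi_epi8(v, zero);     // punpckhbw
}
```

The patch keeps the pmovzx/pmovsx form only for the signed case, where the unpack approach would additionally need arithmetic shifts to sign extend.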
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 19 |
1 file changed, 11 insertions, 8 deletions
```diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 475a1c646bf..951856b046a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -23706,14 +23706,17 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
   // shift the results and pack the half lane results back together.
   MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
 
-  unsigned ExSSE41 = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
-                              : ISD::ZERO_EXTEND_VECTOR_INREG;
 
   // Extract the lo parts and zero/sign extend to i16.
+  // Only use SSE4.1 instructions for signed v16i8 where using unpack requires
+  // shifts to sign extend. Using unpack for unsigned only requires an xor to
+  // create zeros and a copy due to tied registers contraints pre-avx. But using
+  // zero_extend_vector_inreg would require an additional pshufd for the high
+  // part.
   SDValue ALo, BLo;
-  if (VT == MVT::v16i8 && Subtarget.hasSSE41()) {
-    ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
-    BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
+  if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
+    ALo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, A);
+    BLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, B);
   } else if (IsSigned) {
     ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), A);
     BLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), B);
@@ -23730,13 +23733,13 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
 
   // Extract the hi parts and zero/sign extend to i16.
   SDValue AHi, BHi;
-  if (VT == MVT::v16i8 && Subtarget.hasSSE41()) {
+  if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
     const int ShufMask[] = { 8,  9, 10, 11, 12, 13, 14, 15,
                             -1, -1, -1, -1, -1, -1, -1, -1};
     AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
     BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
-    AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
-    BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
+    AHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, AHi);
+    BHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, BHi);
   } else if (IsSigned) {
     AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), A);
     BHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), B);
```
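
For context on what follows the widening, the comment in LowerMULH says the halves are multiplied, shifted, and packed back together. A minimal intrinsics sketch of that tail end for the unsigned case is below; it is illustrative only (the function name is hypothetical and the pack instruction chosen here is just one way to reassemble the bytes), not the DAG code itself.

```cpp
#include <immintrin.h>

// Illustrative only: the effect of unsigned vXi8 MULHU once A and B have been
// widened to v8i16 halves (a_lo/a_hi, b_lo/b_hi). Each 16-bit lane holds a
// zero-extended byte, so pmullw produces the full 8x8->16 product, the logical
// shift keeps its high byte, and packuswb (no saturation can occur since every
// lane is <= 255) reassembles the v16i8 result.
static inline __m128i mulhu_v16i8_from_halves(__m128i a_lo, __m128i a_hi,
                                              __m128i b_lo, __m128i b_hi) {
  __m128i lo = _mm_srli_epi16(_mm_mullo_epi16(a_lo, b_lo), 8); // pmullw + psrlw
  __m128i hi = _mm_srli_epi16(_mm_mullo_epi16(a_hi, b_hi), 8); // pmullw + psrlw
  return _mm_packus_epi16(lo, hi);                             // packuswb
}
```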

