-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp  14
-rw-r--r--  llvm/test/CodeGen/X86/shrink_vmul.ll     46
-rw-r--r--  llvm/test/CodeGen/X86/slow-pmulld.ll     16
3 files changed, 36 insertions, 40 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ad2d4b55bda..ba3b02e25a9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -22086,6 +22086,13 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
"Should not custom lower when pmulld is available!");
+ // If the upper 17 bits of each element are zero then we can use PMADD.
+ APInt Mask17 = APInt::getHighBitsSet(32, 17);
+ if (DAG.MaskedValueIsZero(A, Mask17) && DAG.MaskedValueIsZero(B, Mask17))
+ return DAG.getNode(X86ISD::VPMADDWD, dl, VT,
+ DAG.getBitcast(MVT::v8i16, A),
+ DAG.getBitcast(MVT::v8i16, B));
+
// Extract the odd parts.
static const int UnpackMask[] = { 1, -1, 3, -1 };
SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
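Why this is safe (a note on the transform, not part of the patch): VPMADDWD treats each 32-bit lane as two signed 16-bit halves, multiplies the halves pairwise, and sums the two products. If the upper 17 bits of a lane are zero, the high half is zero and the low half has its sign bit clear, so the sum degenerates to an exact 32-bit multiply; requiring only 16 zero bits would not be enough, because a low half of 0x8000 or more would be multiplied as a negative value. A minimal standalone sketch of that reasoning (hypothetical helper names, independent of the LLVM sources):

    #include <cassert>
    #include <cstdint>

    // Emulate one 32-bit lane of (V)PMADDWD: multiply the signed 16-bit
    // halves pairwise and add the two products.
    int32_t pmaddwd_lane(uint32_t A, uint32_t B) {
      int32_t LoProd = int16_t(A & 0xFFFF) * int16_t(B & 0xFFFF);
      int32_t HiProd = int16_t(A >> 16) * int16_t(B >> 16);
      return LoProd + HiProd;
    }

    int main() {
      // With the upper 17 bits zero both inputs fit in 15 bits: the high
      // product is 0 and the low product cannot overflow int32_t, so the
      // lane result is exactly A * B.
      for (uint32_t A = 0; A < (1u << 15); A += 97)
        for (uint32_t B = 0; B < (1u << 15); B += 89)
          assert(pmaddwd_lane(A, B) == int32_t(A * B));
    }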
@@ -32259,6 +32266,13 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
if ((NumElts % 2) != 0)
return SDValue();
+ // If the upper 17 bits of each element are zero then we can use PMADD.
+ APInt Mask17 = APInt::getHighBitsSet(32, 17);
+ if (VT == MVT::v4i32 && DAG.MaskedValueIsZero(N0, Mask17) &&
+ DAG.MaskedValueIsZero(N1, Mask17))
+ return DAG.getNode(X86ISD::VPMADDWD, DL, VT, DAG.getBitcast(MVT::v8i16, N0),
+ DAG.getBitcast(MVT::v8i16, N1));
+
unsigned RegSize = 128;
MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
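The DAG combine uses the same guard. As a sanity check on the mask it builds (a hypothetical standalone sketch, not LLVM code): APInt::getHighBitsSet(32, 17) is the constant 0xFFFF8000, so MaskedValueIsZero succeeds exactly when every lane is provably at most 0x7FFF.

    #include <cassert>
    #include <cstdint>

    int main() {
      // Plain-integer equivalent of APInt::getHighBitsSet(32, 17).
      uint32_t Mask17 = ~uint32_t(0) << (32 - 17); // 0xFFFF8000
      assert(Mask17 == 0xFFFF8000u);
      // A lane passes the MaskedValueIsZero check iff it fits in 15 bits.
      assert((0x7FFFu & Mask17) == 0);
      assert((0x8000u & Mask17) != 0);
    }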
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index a516c709517..ced3a40e4a4 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -112,13 +112,14 @@ define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT: movl c, %esi
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: pxor %xmm2, %xmm2
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT: pmullw %xmm0, %xmm1
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4)
+; X86-SSE-NEXT: pxor %xmm1, %xmm1
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X86-SSE-NEXT: pmaddwd %xmm0, %xmm2
+; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4)
; X86-SSE-NEXT: popl %esi
; X86-SSE-NEXT: retl
;
@@ -142,13 +143,14 @@ define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: pxor %xmm2, %xmm2
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-SSE-NEXT: pmullw %xmm0, %xmm1
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4)
+; X64-SSE-NEXT: pxor %xmm1, %xmm1
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X64-SSE-NEXT: pmaddwd %xmm0, %xmm2
+; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mul_4xi8:
@@ -2215,13 +2217,7 @@ define void @PR34947() {
; X86-SSE-NEXT: xorl %edx, %edx
; X86-SSE-NEXT: divl (%eax)
; X86-SSE-NEXT: movd %edx, %xmm0
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE-NEXT: pmuludq %xmm2, %xmm3
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm1
; X86-SSE-NEXT: movl $8199, %eax # imm = 0x2007
; X86-SSE-NEXT: movd %eax, %xmm2
; X86-SSE-NEXT: pmuludq %xmm0, %xmm2
@@ -2415,13 +2411,7 @@ define void @PR34947() {
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: divl (%rax)
; X64-SSE-NEXT: movd %edx, %xmm0
-; X64-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; X64-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X64-SSE-NEXT: pmuludq %xmm2, %xmm3
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm1
; X64-SSE-NEXT: movl $8199, %eax # imm = 0x2007
; X64-SSE-NEXT: movd %eax, %xmm2
; X64-SSE-NEXT: pmuludq %xmm0, %xmm2
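In the PR34947 hunks the multiplier is the splat constant <4 x i32> <i32 8199, ...>, which fits in 15 bits, so the pmuludq/pshufd/punpckldq expansion collapses to a single pmaddwd against the reinterpreted constant. A sketch of that reinterpretation, assuming a little-endian target (hypothetical standalone code):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      // Bitcasting v4i32 to v8i16 splits each 32-bit lane into
      // (low half, high half); with the upper 17 bits zero the high
      // half is 0, so pmaddwd computes x*8199 + 0*0 per lane.
      uint32_t Lanes32[4] = {8199, 8199, 8199, 8199};
      uint16_t Lanes16[8];
      std::memcpy(Lanes16, Lanes32, sizeof Lanes32);
      for (int I = 0; I != 4; ++I) {
        assert(Lanes16[2 * I] == 8199);  // low half carries the value
        assert(Lanes16[2 * I + 1] == 0); // high half is zero
      }
    }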
diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll
index 4d73b11349f..325e6ee4085 100644
--- a/llvm/test/CodeGen/X86/slow-pmulld.ll
+++ b/llvm/test/CodeGen/X86/slow-pmulld.ll
@@ -10,22 +10,14 @@
define <4 x i32> @foo(<4 x i8> %A) {
; CHECK32-LABEL: foo:
; CHECK32: # %bb.0:
-; CHECK32-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[4],zero,xmm0[8],zero,xmm0[12],zero,xmm0[u,u,u,u,u,u,u,u]
-; CHECK32-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
-; CHECK32-NEXT: movdqa %xmm0, %xmm2
-; CHECK32-NEXT: pmullw %xmm1, %xmm0
-; CHECK32-NEXT: pmulhw %xmm1, %xmm2
-; CHECK32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK32-NEXT: pand {{\.LCPI.*}}, %xmm0
+; CHECK32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: foo:
; CHECK64: # %bb.0:
-; CHECK64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[4],zero,xmm0[8],zero,xmm0[12],zero,xmm0[u,u,u,u,u,u,u,u]
-; CHECK64-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
-; CHECK64-NEXT: movdqa %xmm0, %xmm2
-; CHECK64-NEXT: pmullw %xmm1, %xmm0
-; CHECK64-NEXT: pmulhw %xmm1, %xmm2
-; CHECK64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; CHECK64-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK64-NEXT: pmaddwd {{.*}}(%rip), %xmm0
; CHECK64-NEXT: retq
;
; SSE4-32-LABEL: foo:
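In this test the operand lanes are zero-extended i8 values and the multiplier is the splat 18778, so both sides pass the 17-zero-bit check: the pand keeps only the low byte of each lane, and one pmaddwd replaces the pshufb/pmullw/pmulhw/punpcklwd expansion. A per-lane model of the new sequence (hypothetical standalone code):

    #include <cassert>
    #include <cstdint>

    // One 32-bit lane of the new CHECK32/CHECK64 sequence.
    int32_t foo_lane(uint32_t Lane) {
      uint32_t Byte = Lane & 0xFF;          // the pand byte mask
      // pmaddwd: low halves multiply, high halves are both zero.
      return int16_t(Byte) * int16_t(18778);
    }

    int main() {
      for (uint32_t V = 0; V != 256; ++V)
        assert(foo_lane(V) == int32_t(V * 18778u));
    }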