 llvm/lib/Target/X86/X86ISelLowering.cpp |  59 +-
 llvm/test/CodeGen/X86/shrink_vmul.ll    | 152 +-
 2 files changed, 100 insertions(+), 111 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index becf658956d..b76ba58b6c0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -32906,6 +32906,46 @@ static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+// If the upper 17 bits of each element are zero then we can use PMADDWD,
+// which is always at least as quick as PMULLD, expect on KNL.
+static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget &Subtarget) {
+  if (!Subtarget.hasSSE2())
+    return SDValue();
+
+  if (Subtarget.getProcFamily() == X86Subtarget::IntelKNL)
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+
+  // Only support vXi32 vectors.
+  if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
+    return SDValue();
+
+  // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
+  MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(WVT))
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  APInt Mask17 = APInt::getHighBitsSet(32, 17);
+  if (!DAG.MaskedValueIsZero(N1, Mask17) ||
+      !DAG.MaskedValueIsZero(N0, Mask17))
+    return SDValue();
+
+  // Use SplitBinaryOpsAndApply to handle AVX splitting.
+  auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, SDValue Op0,
+                           SDValue Op1) {
+    MVT VT = MVT::getVectorVT(MVT::i32, Op0.getValueSizeInBits() / 32);
+    return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Op0, Op1);
+  };
+  return SplitBinaryOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
+                                DAG.getBitcast(WVT, N0),
+                                DAG.getBitcast(WVT, N1), PMADDWDBuilder);
+}
+
 /// Optimize a single multiply with constant into two operations in order to
 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
@@ -32913,23 +32953,8 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
   EVT VT = N->getValueType(0);
 
-  // If the upper 17 bits of each element are zero then we can use PMADDWD,
-  // which is always at least as quick as PMULLD, expect on KNL.
-  if (Subtarget.getProcFamily() != X86Subtarget::IntelKNL &&
-      ((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
-       (VT == MVT::v8i32 && Subtarget.hasAVX2()) ||
-       (VT == MVT::v16i32 && Subtarget.useBWIRegs()))) {
-    SDValue N0 = N->getOperand(0);
-    SDValue N1 = N->getOperand(1);
-    APInt Mask17 = APInt::getHighBitsSet(32, 17);
-    if (DAG.MaskedValueIsZero(N0, Mask17) &&
-        DAG.MaskedValueIsZero(N1, Mask17)) {
-      unsigned NumElts = VT.getVectorNumElements();
-      MVT WVT = MVT::getVectorVT(MVT::i16, 2 * NumElts);
-      return DAG.getNode(X86ISD::VPMADDWD, SDLoc(N), VT,
-                         DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1));
-    }
-  }
+  if (SDValue V = combineMulToPMADDWD(N, DAG, DCI, Subtarget))
+    return V;
 
   if (DCI.isBeforeLegalize() && VT.isVector())
     return reduceVMULWidth(N, DAG, Subtarget);
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index ed7d05fd4bc..aee3892a41e 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -2227,89 +2227,67 @@ define void @PR34947() {
 ;
 ; X86-AVX1-LABEL: PR34947:
 ; X86-AVX1:       # %bb.0:
-; X86-AVX1-NEXT:    pushl %ebp
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX1-NEXT:    pushl %ebx
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 12
-; X86-AVX1-NEXT:    pushl %edi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 16
 ; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 20
-; X86-AVX1-NEXT:    subl $16, %esp
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 36
-; X86-AVX1-NEXT:    .cfi_offset %esi, -20
-; X86-AVX1-NEXT:    .cfi_offset %edi, -16
-; X86-AVX1-NEXT:    .cfi_offset %ebx, -12
-; X86-AVX1-NEXT:    .cfi_offset %ebp, -8
+; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
+; X86-AVX1-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX1-NEXT:    vmovdqa (%eax), %ymm0
-; X86-AVX1-NEXT:    xorl %eax, %eax
-; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl (%eax)
-; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-AVX1-NEXT:    vpextrd $3, %xmm0, %ecx
+; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %ecx
 ; X86-AVX1-NEXT:    xorl %eax, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-AVX1-NEXT:    vpextrd $2, %xmm0, %ecx
+; X86-AVX1-NEXT:    movl %edx, %ecx
+; X86-AVX1-NEXT:    vmovd %xmm0, %esi
 ; X86-AVX1-NEXT:    xorl %eax, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %ecx
+; X86-AVX1-NEXT:    divl %esi
+; X86-AVX1-NEXT:    vmovd %edx, %xmm1
+; X86-AVX1-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vpextrd $2, %xmm0, %ecx
 ; X86-AVX1-NEXT:    xorl %eax, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-AVX1-NEXT:    vmovd %xmm0, %ecx
+; X86-AVX1-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vpextrd $3, %xmm0, %ecx
 ; X86-AVX1-NEXT:    xorl %eax, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    movl %edx, %ebp
+; X86-AVX1-NEXT:    vpinsrd $3, %edx, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %ecx
 ; X86-AVX1-NEXT:    xorl %eax, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    vpextrd $3, %xmm0, %ecx
 ; X86-AVX1-NEXT:    divl %ecx
 ; X86-AVX1-NEXT:    movl %edx, %ecx
+; X86-AVX1-NEXT:    vmovd %xmm0, %esi
 ; X86-AVX1-NEXT:    xorl %eax, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    vpextrd $2, %xmm0, %esi
 ; X86-AVX1-NEXT:    divl %esi
-; X86-AVX1-NEXT:    movl %edx, %esi
+; X86-AVX1-NEXT:    vmovd %edx, %xmm2
+; X86-AVX1-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
+; X86-AVX1-NEXT:    vpextrd $2, %xmm0, %ecx
 ; X86-AVX1-NEXT:    xorl %eax, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %edi
-; X86-AVX1-NEXT:    divl %edi
-; X86-AVX1-NEXT:    movl %edx, %edi
+; X86-AVX1-NEXT:    divl %ecx
+; X86-AVX1-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
+; X86-AVX1-NEXT:    vpextrd $3, %xmm0, %ecx
 ; X86-AVX1-NEXT:    xorl %eax, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    vmovd %xmm0, %ebx
-; X86-AVX1-NEXT:    divl %ebx
-; X86-AVX1-NEXT:    vmovd %edx, %xmm0
-; X86-AVX1-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpinsrd $2, %esi, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vmovd %ebp, %xmm1
-; X86-AVX1-NEXT:    vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload
-; X86-AVX1-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 # 4-byte Folded Reload
-; X86-AVX1-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 # 4-byte Folded Reload
-; X86-AVX1-NEXT:    vmovd {{[0-9]+}}(%esp), %xmm2 # 4-byte Folded Reload
-; X86-AVX1-NEXT:    # xmm2 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT:    movl $8199, %eax # imm = 0x2007
-; X86-AVX1-NEXT:    vmovd %eax, %xmm3
-; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [8199,8199,8199,8199]
-; X86-AVX1-NEXT:    vpmaddwd %xmm4, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpmaddwd %xmm4, %xmm1, %xmm1
+; X86-AVX1-NEXT:    divl %ecx
+; X86-AVX1-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm0
+; X86-AVX1-NEXT:    xorl %eax, %eax
+; X86-AVX1-NEXT:    xorl %edx, %edx
+; X86-AVX1-NEXT:    divl (%eax)
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
+; X86-AVX1-NEXT:    vpmaddwd %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpmaddwd %xmm2, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm2, %xmm1
+; X86-AVX1-NEXT:    vmovd %edx, %xmm1
+; X86-AVX1-NEXT:    movl $8199, %eax # imm = 0x2007
+; X86-AVX1-NEXT:    vmovd %eax, %xmm2
+; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vmovd %xmm1, (%eax)
 ; X86-AVX1-NEXT:    vmovaps %ymm0, (%eax)
-; X86-AVX1-NEXT:    addl $16, %esp
 ; X86-AVX1-NEXT:    popl %esi
-; X86-AVX1-NEXT:    popl %edi
-; X86-AVX1-NEXT:    popl %ebx
-; X86-AVX1-NEXT:    popl %ebp
 ; X86-AVX1-NEXT:    vzeroupper
 ; X86-AVX1-NEXT:    retl
 ;
@@ -2421,77 +2399,63 @@ define void @PR34947() {
 ;
 ; X64-AVX1-LABEL: PR34947:
 ; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rbp
-; X64-AVX1-NEXT:    .cfi_def_cfa_offset 16
-; X64-AVX1-NEXT:    pushq %rbx
-; X64-AVX1-NEXT:    .cfi_def_cfa_offset 24
-; X64-AVX1-NEXT:    .cfi_offset %rbx, -24
-; X64-AVX1-NEXT:    .cfi_offset %rbp, -16
 ; X64-AVX1-NEXT:    vmovdqa (%rax), %ymm0
+; X64-AVX1-NEXT:    vpextrd $1, %xmm0, %ecx
 ; X64-AVX1-NEXT:    xorl %eax, %eax
 ; X64-AVX1-NEXT:    xorl %edx, %edx
-; X64-AVX1-NEXT:    divl (%rax)
-; X64-AVX1-NEXT:    movl %edx, %r8d
-; X64-AVX1-NEXT:    vpextrd $3, %xmm0, %ecx
+; X64-AVX1-NEXT:    divl %ecx
+; X64-AVX1-NEXT:    movl %edx, %ecx
+; X64-AVX1-NEXT:    vmovd %xmm0, %esi
 ; X64-AVX1-NEXT:    xorl %eax, %eax
 ; X64-AVX1-NEXT:    xorl %edx, %edx
-; X64-AVX1-NEXT:    divl %ecx
-; X64-AVX1-NEXT:    movl %edx, %r9d
+; X64-AVX1-NEXT:    divl %esi
+; X64-AVX1-NEXT:    vmovd %edx, %xmm1
+; X64-AVX1-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
 ; X64-AVX1-NEXT:    vpextrd $2, %xmm0, %ecx
 ; X64-AVX1-NEXT:    xorl %eax, %eax
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ecx
-; X64-AVX1-NEXT:    movl %edx, %r10d
-; X64-AVX1-NEXT:    vpextrd $1, %xmm0, %ecx
+; X64-AVX1-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vpextrd $3, %xmm0, %ecx
 ; X64-AVX1-NEXT:    xorl %eax, %eax
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ecx
-; X64-AVX1-NEXT:    movl %edx, %r11d
-; X64-AVX1-NEXT:    vmovd %xmm0, %ecx
+; X64-AVX1-NEXT:    vpinsrd $3, %edx, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; X64-AVX1-NEXT:    vpextrd $1, %xmm0, %ecx
 ; X64-AVX1-NEXT:    xorl %eax, %eax
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ecx
-; X64-AVX1-NEXT:    movl %edx, %esi
-; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X64-AVX1-NEXT:    vpextrd $3, %xmm0, %ecx
+; X64-AVX1-NEXT:    movl %edx, %ecx
+; X64-AVX1-NEXT:    vmovd %xmm0, %esi
 ; X64-AVX1-NEXT:    xorl %eax, %eax
 ; X64-AVX1-NEXT:    xorl %edx, %edx
-; X64-AVX1-NEXT:    divl %ecx
-; X64-AVX1-NEXT:    movl %edx, %edi
+; X64-AVX1-NEXT:    divl %esi
+; X64-AVX1-NEXT:    vmovd %edx, %xmm2
+; X64-AVX1-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
 ; X64-AVX1-NEXT:    vpextrd $2, %xmm0, %ecx
 ; X64-AVX1-NEXT:    xorl %eax, %eax
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ecx
-; X64-AVX1-NEXT:    movl %edx, %ecx
-; X64-AVX1-NEXT:    vpextrd $1, %xmm0, %ebx
+; X64-AVX1-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vpextrd $3, %xmm0, %ecx
 ; X64-AVX1-NEXT:    xorl %eax, %eax
 ; X64-AVX1-NEXT:    xorl %edx, %edx
-; X64-AVX1-NEXT:    divl %ebx
-; X64-AVX1-NEXT:    movl %edx, %ebx
-; X64-AVX1-NEXT:    vmovd %xmm0, %ebp
+; X64-AVX1-NEXT:    divl %ecx
+; X64-AVX1-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm0
 ; X64-AVX1-NEXT:    xorl %eax, %eax
 ; X64-AVX1-NEXT:    xorl %edx, %edx
-; X64-AVX1-NEXT:    divl %ebp
-; X64-AVX1-NEXT:    vmovd %edx, %xmm0
-; X64-AVX1-NEXT:    vpinsrd $1, %ebx, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
-; X64-AVX1-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vmovd %esi, %xmm2
-; X64-AVX1-NEXT:    vpinsrd $1, %r11d, %xmm2, %xmm2
-; X64-AVX1-NEXT:    vpinsrd $2, %r10d, %xmm2, %xmm2
-; X64-AVX1-NEXT:    vpinsrd $3, %r9d, %xmm2, %xmm2
-; X64-AVX1-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
+; X64-AVX1-NEXT:    divl (%rax)
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
+; X64-AVX1-NEXT:    vpmaddwd %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpmaddwd %xmm2, %xmm1, %xmm1
 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; X64-AVX1-NEXT:    vmovd %r8d, %xmm1
+; X64-AVX1-NEXT:    vmovd %edx, %xmm1
 ; X64-AVX1-NEXT:    movl $8199, %eax # imm = 0x2007
 ; X64-AVX1-NEXT:    vmovd %eax, %xmm2
 ; X64-AVX1-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
 ; X64-AVX1-NEXT:    vmovd %xmm1, (%rax)
 ; X64-AVX1-NEXT:    vmovaps %ymm0, (%rax)
-; X64-AVX1-NEXT:    popq %rbx
-; X64-AVX1-NEXT:    popq %rbp
 ; X64-AVX1-NEXT:    vzeroupper
 ; X64-AVX1-NEXT:    retq
 ;
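
For illustration only (not part of this commit), here is a minimal IR sketch of the pattern the new combine targets; the function name and types are hypothetical and not taken from shrink_vmul.ll. Because both multiplicands are zero-extended from i8, DAG.MaskedValueIsZero can prove the upper 17 bits of every i32 element are zero, so combineMulToPMADDWD can select X86ISD::VPMADDWD instead of a PMULLD sequence on any non-KNL target with SSE2.

; Hypothetical example: upper 17 bits of each i32 element are known zero,
; so the multiply can be lowered to vpmaddwd.
define <4 x i32> @mul_zext_v4i8(<4 x i8> %a, <4 x i8> %b) {
  %za = zext <4 x i8> %a to <4 x i32>
  %zb = zext <4 x i8> %b to <4 x i32>
  %mul = mul <4 x i32> %za, %zb
  ret <4 x i32> %mul
}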

