 llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp |  26
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp    |  36
 llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll    |   4
 llvm/test/CodeGen/X86/bitcast-setcc-256.ll       |  16
 llvm/test/CodeGen/X86/bitcast-setcc-512.ll       |   8
 llvm/test/CodeGen/X86/dagcombine-cse.ll          |  13
 llvm/test/CodeGen/X86/masked_store.ll            |  82
 llvm/test/CodeGen/X86/movmsk-cmp.ll              | 112
 8 files changed, 116 insertions(+), 181 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 636fa6c1dab..eb089e5201f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1471,12 +1471,36 @@ bool TargetLowering::SimplifyDemandedBits(
       if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts,
                                KnownSrcBits, TLO, Depth + 1))
         return true;
+    } else if ((NumSrcEltBits % BitWidth) == 0 &&
+               TLO.DAG.getDataLayout().isLittleEndian()) {
+      unsigned Scale = NumSrcEltBits / BitWidth;
+      unsigned NumSrcElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1;
+      APInt DemandedSrcBits = APInt::getNullValue(NumSrcEltBits);
+      APInt DemandedSrcElts = APInt::getNullValue(NumSrcElts);
+      for (unsigned i = 0; i != NumElts; ++i)
+        if (DemandedElts[i]) {
+          unsigned Offset = (i % Scale) * BitWidth;
+          DemandedSrcBits.insertBits(DemandedBits, Offset);
+          DemandedSrcElts.setBit(i / Scale);
+        }
+
+      if (SrcVT.isVector()) {
+        APInt KnownSrcUndef, KnownSrcZero;
+        if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, KnownSrcUndef,
+                                       KnownSrcZero, TLO, Depth + 1))
+          return true;
+      }
+
+      KnownBits KnownSrcBits;
+      if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts,
+                               KnownSrcBits, TLO, Depth + 1))
+        return true;
     }
 
     // If this is a bitcast, let computeKnownBits handle it.  Only do this on a
     // recursive call where Known may be useful to the caller.
     if (Depth > 0) {
-      Known = TLO.DAG.computeKnownBits(Op, Depth);
+      Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
       return false;
     }
     break;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 08affa8f057..9115f3040a2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3147,30 +3147,44 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
 
 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
-  if (N->getValueType(0) != MVT::i64)
-    return SDValue();
-
-  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
   if (!RHS)
     return SDValue();
 
+  EVT VT = N->getValueType(0);
+  SDValue LHS = N->getOperand(0);
   unsigned ShiftAmt = RHS->getZExtValue();
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+
+  // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
+  // this improves the ability to match BFE patterns in isel.
+  if (LHS.getOpcode() == ISD::AND) {
+    if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
+      if (Mask->getAPIntValue().isShiftedMask() &&
+          Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) {
+        return DAG.getNode(
+            ISD::AND, SL, VT,
+            DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
+            DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
+      }
+    }
+  }
+
+  if (VT != MVT::i64)
+    return SDValue();
+
   if (ShiftAmt < 32)
     return SDValue();
 
   // srl i64:x, C for C >= 32
   // =>
   //   build_pair (srl hi_32(x), C - 32), 0
-
-  SelectionDAG &DAG = DCI.DAG;
-  SDLoc SL(N);
-
   SDValue One = DAG.getConstant(1, SL, MVT::i32);
   SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
 
-  SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
-  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
-                           VecOp, One);
+  SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, LHS);
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecOp, One);
 
   SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
   SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index 13380e03e32..f5857a330fa 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -86,8 +86,8 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    ds_write_b16 v1, v2 offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x7f0000, v0
-; GFX9-NEXT:    ds_write_b8_d16_hi v1, v0 offset:6
+; GFX9-NEXT:    v_bfe_u32 v0, v0, 16, 7
+; GFX9-NEXT:    ds_write_b8 v1, v0 offset:6
 ; GFX9-NEXT:    ds_write_b32 v1, v3
 ; GFX9-NEXT:    s_endpgm
   store i55 %arg, i55 addrspace(3)* %ptr, align 8
diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-256.ll b/llvm/test/CodeGen/X86/bitcast-setcc-256.ll
index 41635f37528..f9a233a583b 100644
--- a/llvm/test/CodeGen/X86/bitcast-setcc-256.ll
+++ b/llvm/test/CodeGen/X86/bitcast-setcc-256.ll
@@ -448,22 +448,6 @@ define void @bitcast_8i32_store(i8* %p, <8 x i32> %a0) {
 define void @bitcast_4i64_store(i4* %p, <4 x i64> %a0) {
 ; SSE2-SSSE3-LABEL: bitcast_4i64_store:
 ; SSE2-SSSE3:       # %bb.0:
-; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-SSSE3-NEXT:    pxor %xmm2, %xmm1
-; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm3
-; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
-; SSE2-SSSE3-NEXT:    pand %xmm3, %xmm1
-; SSE2-SSSE3-NEXT:    por %xmm4, %xmm1
-; SSE2-SSSE3-NEXT:    pxor %xmm2, %xmm0
-; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-SSSE3-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE2-SSSE3-NEXT:    pand %xmm3, %xmm0
-; SSE2-SSSE3-NEXT:    por %xmm2, %xmm0
 ; SSE2-SSSE3-NEXT:    packssdw %xmm1, %xmm0
 ; SSE2-SSSE3-NEXT:    movmskps %xmm0, %eax
 ; SSE2-SSSE3-NEXT:    movb %al, (%rdi)
diff --git a/llvm/test/CodeGen/X86/bitcast-setcc-512.ll b/llvm/test/CodeGen/X86/bitcast-setcc-512.ll
index 3c294345dd5..177be1fd6a6 100644
--- a/llvm/test/CodeGen/X86/bitcast-setcc-512.ll
+++ b/llvm/test/CodeGen/X86/bitcast-setcc-512.ll
@@ -609,15 +609,13 @@ define void @bitcast_8i64_store(i8* %p, <8 x i64> %a0) {
 ;
 ; AVX1-LABEL: bitcast_8i64_store:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    movb %al, (%rdi)
diff --git a/llvm/test/CodeGen/X86/dagcombine-cse.ll b/llvm/test/CodeGen/X86/dagcombine-cse.ll
index bf1dab35875..a532d87170d 100644
--- a/llvm/test/CodeGen/X86/dagcombine-cse.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-cse.ll
@@ -14,18 +14,11 @@ define i32 @t(i8* %ref_frame_ptr, i32 %ref_frame_stride, i32 %idxX, i32 %idxY) n
 ;
 ; X64-LABEL: t:
 ; X64:       ## %bb.0: ## %entry
-; X64-NEXT:    ## kill: def $edx killed $edx def $rdx
-; X64-NEXT:    ## kill: def $esi killed $esi def $rsi
 ; X64-NEXT:    imull %ecx, %esi
-; X64-NEXT:    leal (%rsi,%rdx), %eax
-; X64-NEXT:    cltq
+; X64-NEXT:    addl %edx, %esi
+; X64-NEXT:    movslq %esi, %rax
 ; X64-NEXT:    movl (%rdi,%rax), %eax
-; X64-NEXT:    leal 4(%rsi,%rdx), %ecx
-; X64-NEXT:    movslq %ecx, %rcx
-; X64-NEXT:    movzwl (%rdi,%rcx), %ecx
-; X64-NEXT:    shlq $32, %rcx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    movq %rcx, %xmm0
+; X64-NEXT:    movq %rax, %xmm0
 ; X64-NEXT:    movd %xmm0, %eax
 ; X64-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index 4b26436f1b0..cb8d59a2b12 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -36,25 +36,21 @@ define void @store_v1f64_v1i64(<1 x i64> %trigger, <1 x double>* %addr, <1 x dou
 define void @store_v2f64_v2i64(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %val) {
 ; SSE2-LABEL: store_v2f64_v2i64:
 ; SSE2:       ## %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE2-NEXT:    pxor %xmm3, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    pand %xmm2, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm4
-; SSE2-NEXT:    movd %xmm4, %eax
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je LBB1_2
 ; SSE2-NEXT:  ## %bb.1: ## %cond.store
 ; SSE2-NEXT:    movlpd %xmm1, (%rdi)
 ; SSE2-NEXT:  LBB1_2: ## %else
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    pextrw $4, %xmm0, %eax
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je LBB1_4
@@ -117,20 +113,16 @@ define void @store_v4f64_v4i64(<4 x i64> %trigger, <4 x double>* %addr, <4 x dou
 ; SSE2-NEXT:    movdqa %xmm4, %xmm5
 ; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
 ; SSE2-NEXT:    pcmpeqd %xmm4, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NEXT:    pand %xmm5, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm6, %xmm7
-; SSE2-NEXT:    movd %xmm7, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je LBB2_2
 ; SSE2-NEXT:  ## %bb.1: ## %cond.store
 ; SSE2-NEXT:    movlpd %xmm2, (%rdi)
 ; SSE2-NEXT:  LBB2_2: ## %else
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pand %xmm5, %xmm0
-; SSE2-NEXT:    por %xmm6, %xmm0
 ; SSE2-NEXT:    pextrw $4, %xmm0, %eax
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je LBB2_4
@@ -140,10 +132,9 @@ define void @store_v4f64_v4i64(<4 x i64> %trigger, <4 x double>* %addr, <4 x dou
 ; SSE2-NEXT:    pxor %xmm4, %xmm1
 ; SSE2-NEXT:    movdqa %xmm4, %xmm0
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm4, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm0, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    pextrw $0, %xmm0, %eax
@@ -863,25 +854,21 @@ define void @store_v16f32_v16i32(<16 x float> %x, <16 x float>* %ptr, <16 x floa
 define void @store_v2i64_v2i64(<2 x i64> %trigger, <2 x i64>* %addr, <2 x i64> %val) {
 ; SSE2-LABEL: store_v2i64_v2i64:
 ; SSE2:       ## %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE2-NEXT:    pxor %xmm3, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    pand %xmm2, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm4
-; SSE2-NEXT:    movd %xmm4, %eax
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je LBB7_2
 ; SSE2-NEXT:  ## %bb.1: ## %cond.store
 ; SSE2-NEXT:    movq %xmm1, (%rdi)
 ; SSE2-NEXT:  LBB7_2: ## %else
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm3, %xmm0
 ; SSE2-NEXT:    pextrw $4, %xmm0, %eax
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je LBB7_4
@@ -950,20 +937,16 @@ define void @store_v4i64_v4i64(<4 x i64> %trigger, <4 x i64>* %addr, <4 x i64> %
 ; SSE2-NEXT:    movdqa %xmm4, %xmm5
 ; SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
 ; SSE2-NEXT:    pcmpeqd %xmm4, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NEXT:    pand %xmm5, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm6, %xmm7
-; SSE2-NEXT:    movd %xmm7, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je LBB8_2
 ; SSE2-NEXT:  ## %bb.1: ## %cond.store
 ; SSE2-NEXT:    movq %xmm2, (%rdi)
 ; SSE2-NEXT:  LBB8_2: ## %else
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pand %xmm5, %xmm0
-; SSE2-NEXT:    por %xmm6, %xmm0
 ; SSE2-NEXT:    pextrw $4, %xmm0, %eax
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je LBB8_4
@@ -974,10 +957,9 @@ define void @store_v4i64_v4i64(<4 x i64> %trigger, <4 x i64>* %addr, <4 x i64> %
 ; SSE2-NEXT:    pxor %xmm4, %xmm1
 ; SSE2-NEXT:    movdqa %xmm4, %xmm0
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
 ; SSE2-NEXT:    pcmpeqd %xmm4, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    pand %xmm0, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    pextrw $0, %xmm0, %eax
diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll
index 2b31acb6c88..ab6bf66de27 100644
--- a/llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -929,22 +929,6 @@ define i1 @allzeros_v16i32_sign(<16 x i32> %arg) {
 define i1 @allones_v4i64_sign(<4 x i64> %arg) {
 ; SSE2-LABEL: allones_v4i64_sign:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    packssdw %xmm1, %xmm0
 ; SSE2-NEXT:    movmskps %xmm0, %eax
 ; SSE2-NEXT:    cmpb $15, %al
@@ -989,22 +973,6 @@ define i1 @allones_v4i64_sign(<4 x i64> %arg) {
 define i1 @allzeros_v4i64_sign(<4 x i64> %arg) {
 ; SSE2-LABEL: allzeros_v4i64_sign:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    packssdw %xmm1, %xmm0
 ; SSE2-NEXT:    movmskps %xmm0, %eax
 ; SSE2-NEXT:    testb %al, %al
@@ -1095,15 +1063,13 @@ define i1 @allones_v8i64_sign(<8 x i64> %arg) {
 ;
 ; AVX1-LABEL: allones_v8i64_sign:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    cmpb $-1, %al
@@ -1198,15 +1164,13 @@ define i1 @allzeros_v8i64_sign(<8 x i64> %arg) {
 ;
 ; AVX1-LABEL: allzeros_v8i64_sign:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    testb %al, %al
@@ -2539,19 +2503,17 @@ define i1 @allones_v8i64_and1(<8 x i64> %arg) {
 ;
 ; AVX1-LABEL: allones_v8i64_and1:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpsllq $63, %xmm2, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpsllq $63, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpsllq $63, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpsllq $63, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllq $63, %xmm1, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    cmpb $-1, %al
@@ -2615,19 +2577,17 @@ define i1 @allzeros_v8i64_and1(<8 x i64> %arg) {
 ;
 ; AVX1-LABEL: allzeros_v8i64_and1:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpsllq $63, %xmm2, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpsllq $63, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpsllq $63, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpsllq $63, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllq $63, %xmm1, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    testb %al, %al
@@ -3962,19 +3922,17 @@ define i1 @allones_v8i64_and4(<8 x i64> %arg) {
 ;
 ; AVX1-LABEL: allones_v8i64_and4:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpsllq $61, %xmm2, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpsllq $61, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpsllq $61, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpsllq $61, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpsllq $61, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllq $61, %xmm1, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    cmpb $-1, %al
@@ -4038,19 +3996,17 @@ define i1 @allzeros_v8i64_and4(<8 x i64> %arg) {
 ;
 ; AVX1-LABEL: allzeros_v8i64_and4:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpsllq $61, %xmm2, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpsllq $61, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpsllq $61, %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpsllq $61, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpsllq $61, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllq $61, %xmm1, %xmm1
+; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
 ; AVX1-NEXT:    testb %al, %al
@@ -4170,22 +4126,6 @@ define i32 @movmskps(<4 x float> %x) {
 define i32 @movmskpd256(<4 x double> %x) {
 ; SSE2-LABEL: movmskpd256:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    por %xmm4, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    packssdw %xmm1, %xmm0
 ; SSE2-NEXT:    movmskps %xmm0, %eax
 ; SSE2-NEXT:    retq
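The TargetLowering.cpp change teaches SimplifyDemandedBits to look through a bitcast from wide source elements to narrower destination elements on little-endian targets: each demanded narrow bit is mapped to the corresponding bit range of the covering wide source element, so a wide compare whose result is only consumed through its sign bits can be shrunk. That is what removes the 64-bit signed-compare expansions in the bitcast-setcc, masked_store, and movmsk-cmp diffs above. A minimal IR sketch of the kind of pattern that benefits; the function name and body here are illustrative, not copied from the test files:

; Only the per-lane sign bits survive the bitcast to i4, so with this patch the
; v4i64 compare no longer needs the full 64-bit compare expansion on SSE2 and
; can be lowered with packssdw + movmskps, as in the checks above.
define i1 @sign_bits_only(<4 x i64> %v) {
  %cmp = icmp slt <4 x i64> %v, zeroinitializer
  %mask = bitcast <4 x i1> %cmp to i4
  %any = icmp ne i4 %mask, 0
  ret i1 %any
}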

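The AMDGPU combine rewrites (srl (and x, c1 << c2), c2) into (and (srl x, c2), c1) whenever the mask is a shifted mask whose trailing-zero count equals the shift amount, which lets instruction selection form a single v_bfe_u32, as the store-weird-sizes.ll update shows. A hypothetical reduced example (not taken from the test suite) of the shape it targets:

; With the mask 0x7F0000 (0x7F << 16) and a shift of 16, the combine yields
; (and (srl %x, 16), 0x7F), which matches the BFE pattern v_bfe_u32 dst, %x, 16, 7.
define i32 @srl_of_shifted_mask(i32 %x) {
  %masked = and i32 %x, 8323072   ; 0x7F0000, a shifted mask with 16 trailing zeros
  %bits = lshr i32 %masked, 16    ; shift amount == trailing-zero count of the mask
  ret i32 %bits
}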
