-rw-r--r-- | llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 30
-rw-r--r-- | llvm/test/CodeGen/X86/vsel-cmp-load.ll        | 67
2 files changed, 61 insertions, 36 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 77e5cdd8f8e..e0316aa3f69 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7331,6 +7331,36 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
       AddToWorklist(Add.getNode());
       return DAG.getNode(ISD::XOR, DL, VT, Add, Shift);
     }
+
+    // If this select has a condition (setcc) with narrower operands than the
+    // select, try to widen the compare to match the select width.
+    // TODO: This should be extended to handle any constant.
+    // TODO: This could be extended to handle non-loading patterns, but that
+    //       requires thorough testing to avoid regressions.
+    if (isNullConstantOrNullSplatConstant(RHS)) {
+      EVT NarrowVT = LHS.getValueType();
+      EVT WideVT = N1.getValueType().changeVectorElementTypeToInteger();
+      EVT SetCCVT = getSetCCResultType(LHS.getValueType());
+      unsigned SetCCWidth = SetCCVT.getScalarSizeInBits();
+      unsigned WideWidth = WideVT.getScalarSizeInBits();
+      bool IsSigned = isSignedIntSetCC(CC);
+      auto LoadExtOpcode = IsSigned ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+      if (LHS.getOpcode() == ISD::LOAD && LHS.hasOneUse() &&
+          SetCCWidth != 1 && SetCCWidth < WideWidth &&
+          TLI.isLoadExtLegalOrCustom(LoadExtOpcode, WideVT, NarrowVT) &&
+          TLI.isOperationLegalOrCustom(ISD::SETCC, WideVT)) {
+        // Both compare operands can be widened for free. The LHS can use an
+        // extended load, and the RHS is a constant:
+        //   vselect (ext (setcc load(X), C)), N1, N2 -->
+        //   vselect (setcc extload(X), C'), N1, N2
+        auto ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+        SDValue WideLHS = DAG.getNode(ExtOpcode, DL, WideVT, LHS);
+        SDValue WideRHS = DAG.getNode(ExtOpcode, DL, WideVT, RHS);
+        EVT WideSetCCVT = getSetCCResultType(WideVT);
+        SDValue WideSetCC = DAG.getSetCC(DL, WideSetCCVT, WideLHS, WideRHS, CC);
+        return DAG.getSelect(DL, N1.getValueType(), WideSetCC, N1, N2);
+      }
+    }
   }
 
   if (SimplifySelectOps(N, N1, N2))
diff --git a/llvm/test/CodeGen/X86/vsel-cmp-load.ll b/llvm/test/CodeGen/X86/vsel-cmp-load.ll
index b95f1a6e491..b9f18fe89a5 100644
--- a/llvm/test/CodeGen/X86/vsel-cmp-load.ll
+++ b/llvm/test/CodeGen/X86/vsel-cmp-load.ll
@@ -21,11 +21,9 @@ define <8 x i32> @eq_zero(<8 x i8>* %p, <8 x i32> %x, <8 x i32> %y) {
 ;
 ; AVX2-LABEL: eq_zero:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX2-NEXT:    vpslld $24, %ymm2, %ymm2
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -58,12 +56,11 @@ define <4 x i64> @ne_zero(<4 x i16>* %p, <4 x i64> %x, <4 x i64> %y) {
 ;
 ; AVX2-LABEL: ne_zero:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT:    vpcmpeqq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -96,10 +93,9 @@ define <16 x i16> @sgt_zero(<16 x i8>* %p, <16 x i16> %x, <16 x i16> %y) {
 ;
 ; AVX2-LABEL: sgt_zero:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %xmm2
+; AVX2-NEXT:    vpmovsxbw (%rdi), %ymm2
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpgtb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT:    vpcmpgtw %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -119,23 +115,21 @@ define <8 x i32> @slt_zero(<8 x i8>* %p, <8 x i32> %x, <8 x i32> %y) {
 ; AVX1-LABEL: slt_zero:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpmovsxbw (%rdi), %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX1-NEXT:    vpslld $24, %xmm3, %xmm3
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX1-NEXT:    vpslld $24, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: slt_zero:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovsxbw (%rdi), %xmm2
+; AVX2-NEXT:    vpmovsxbd (%rdi), %ymm2
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpgtw %xmm2, %xmm3, %xmm2
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX2-NEXT:    vpslld $24, %ymm2, %ymm2
+; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -166,10 +160,9 @@ define <4 x double> @eq_zero_fp_select(<4 x i8>* %p, <4 x double> %x, <4 x doubl
 ;
 ; AVX2-LABEL: eq_zero_fp_select:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm2 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT:    vpcmpeqq %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -203,13 +196,11 @@ define <8 x float> @ne_zero_fp_select(<8 x i8>* %p, <8 x float> %x, <8 x float>
 ;
 ; AVX2-LABEL: ne_zero_fp_select:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX2-NEXT:    vpslld $24, %ymm2, %ymm2
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -229,21 +220,21 @@ define <4 x double> @sgt_zero_fp_select(<4 x i8>* %p, <4 x double> %x, <4 x doub
 ; AVX1-LABEL: sgt_zero_fp_select:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpmovsxbd (%rdi), %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm3
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
 ; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: sgt_zero_fp_select:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovsxbd (%rdi), %xmm2
+; AVX2-NEXT:    vpmovsxbq (%rdi), %ymm2
 ; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vpcmpgtd %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -260,6 +251,8 @@ define <4 x double> @sgt_zero_fp_select(<4 x doub
   ret <4 x double> %sel
 }
 
+; FIXME: The compare with 0 for AVX2 should be eliminated.
+
 define <8 x float> @slt_zero_fp_select(<8 x i16>* %p, <8 x float> %x, <8 x float> %y) {
 ; AVX1-LABEL: slt_zero_fp_select:
 ; AVX1:       # %bb.0:
@@ -272,6 +265,8 @@ define <8 x float> @slt_zero_fp_select(<8 x i16>* %p, <8 x float> %x, <8 x float
 ; AVX2-LABEL: slt_zero_fp_select:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpmovsxwd (%rdi), %ymm2
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
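The IR bodies of the tests are elided from the hunks above. As a rough sketch (modeled on the existing @eq_zero test; the function name below is hypothetical, not taken from the patch), the pattern the new combine targets is a one-use narrow load feeding a compare whose result selects between wider operands:

  define <8 x i32> @widen_cmp_example(<8 x i8>* %p, <8 x i32> %x, <8 x i32> %y) {
    ; narrow (<8 x i8>) load and compare; the select operands are <8 x i32>
    %load = load <8 x i8>, <8 x i8>* %p
    %cond = icmp eq <8 x i8> %load, zeroinitializer
    %sel = select <8 x i1> %cond, <8 x i32> %x, <8 x i32> %y
    ret <8 x i32> %sel
  }

With the combine, the load is extended first (zero-extended here, since eq is not a signed predicate) and the compare runs at the select's element width, so the AVX2 output in the first hunk becomes vpmovzxbd + vpcmpeqd feeding vblendvps instead of a narrow compare whose mask then has to be re-extended and shifted.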

