-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp        | 70
-rw-r--r-- | llvm/test/CodeGen/X86/masked_gather_scatter.ll | 64
2 files changed, 48 insertions, 86 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 052300d6f72..54ca3721cf5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42572,16 +42572,17 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
   SDValue Base = GorS->getBasePtr();
   SDValue Scale = GorS->getScale();
 
-  // Shrink constant indices if they are larger than 32-bits.
-  // Only do this before legalize types since v2i64 could become v2i32.
-  // FIXME: We could check that the type is legal if we're after legalize types,
-  // but then we would need to construct test cases where that happens.
-  // FIXME: We could support more than just constant vectors, but we need to
-  // careful with costing. A truncate that can be optimized out would be fine.
-  // Otherwise we might only want to create a truncate if it avoids a split.
   if (DCI.isBeforeLegalize()) {
+    unsigned IndexWidth = Index.getScalarValueSizeInBits();
+
+    // Shrink constant indices if they are larger than 32-bits.
+    // Only do this before legalize types since v2i64 could become v2i32.
+    // FIXME: We could check that the type is legal if we're after legalize
+    // types, but then we would need to construct test cases where that happens.
+    // FIXME: We could support more than just constant vectors, but we need to
+    // careful with costing. A truncate that can be optimized out would be fine.
+    // Otherwise we might only want to create a truncate if it avoids a split.
     if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
-      unsigned IndexWidth = Index.getScalarValueSizeInBits();
       if (BV->isConstant() && IndexWidth > 32 &&
           DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
         unsigned NumElts = Index.getValueType().getVectorNumElements();
@@ -42604,16 +42605,18 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
                                     Scatter->getIndexType());
       }
     }
-  }
-
-  if (DCI.isBeforeLegalizeOps()) {
-    // Remove any sign extends from 32 or smaller to larger than 32.
-    // Only do this before LegalizeOps in case we need the sign extend for
-    // legalization.
-    if (Index.getOpcode() == ISD::SIGN_EXTEND &&
-        Index.getScalarValueSizeInBits() > 32 &&
-        Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
-      Index = Index.getOperand(0);
+
+    // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
+    // there are sufficient sign bits. Only do this before legalize types to
+    // avoid creating illegal types in truncate.
+    if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
+         Index.getOpcode() == ISD::ZERO_EXTEND) &&
+        IndexWidth > 32 &&
+        Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
+        DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
+      unsigned NumElts = Index.getValueType().getVectorNumElements();
+      EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+      Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
       if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
         SDValue Ops[] = { Chain, Gather->getPassThru(),
                           Mask, Base, Index, Scale } ;
@@ -42630,11 +42633,14 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
                                   Ops, Scatter->getMemOperand(),
                                   Scatter->getIndexType());
     }
+  }
+
+  if (DCI.isBeforeLegalizeOps()) {
+    unsigned IndexWidth = Index.getScalarValueSizeInBits();
 
     // Make sure the index is either i32 or i64
-    unsigned ScalarSize = Index.getScalarValueSizeInBits();
-    if (ScalarSize != 32 && ScalarSize != 64) {
-      MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32;
+    if (IndexWidth != 32 && IndexWidth != 64) {
+      MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
       EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
                                      Index.getValueType().getVectorNumElements());
       Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
@@ -42654,30 +42660,6 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
                                   Ops, Scatter->getMemOperand(),
                                   Scatter->getIndexType());
     }
-
-    // Try to remove zero extends from 32->64 if we know the sign bit of
-    // the input is zero.
-    if (Index.getOpcode() == ISD::ZERO_EXTEND &&
-        Index.getScalarValueSizeInBits() == 64 &&
-        Index.getOperand(0).getScalarValueSizeInBits() == 32 &&
-        DAG.SignBitIsZero(Index.getOperand(0))) {
-      Index = Index.getOperand(0);
-      if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
-        SDValue Ops[] = { Chain, Gather->getPassThru(),
-                          Mask, Base, Index, Scale } ;
-        return DAG.getMaskedGather(Gather->getVTList(),
-                                   Gather->getMemoryVT(), DL, Ops,
-                                   Gather->getMemOperand(),
-                                   Gather->getIndexType());
-      }
-      auto *Scatter = cast<MaskedScatterSDNode>(GorS);
-      SDValue Ops[] = { Chain, Scatter->getValue(),
-                        Mask, Base, Index, Scale };
-      return DAG.getMaskedScatter(Scatter->getVTList(),
-                                  Scatter->getMemoryVT(), DL,
-                                  Ops, Scatter->getMemOperand(),
-                                  Scatter->getIndexType());
-    }
   }
 
   // With vector masks we only demand the upper bit of the mask.
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index fe870e51538..2e4edb99094 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -2693,56 +2693,32 @@ declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <8 x i1
 define <16 x float> @zext_i8_index(float* %base, <16 x i8> %ind) {
 ; KNL_64-LABEL: zext_i8_index:
 ; KNL_64:       # %bb.0:
-; KNL_64-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; KNL_64-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; KNL_64-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL_64-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; KNL_64-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_64-NEXT:    kxnorw %k0, %k0, %k2
-; KNL_64-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2}
-; KNL_64-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; KNL_64-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_64-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: zext_i8_index:
 ; KNL_32:       # %bb.0:
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; KNL_32-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; KNL_32-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL_32-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; KNL_32-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_32-NEXT:    kxnorw %k0, %k0, %k2
-; KNL_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm2 {%k2}
-; KNL_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; KNL_32-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: zext_i8_index:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; SKX-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; SKX-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
-; SKX-NEXT:    kxnorw %k0, %k0, %k2
-; SKX-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2}
-; SKX-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; SKX-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: zext_i8_index:
 ; SKX_32:       # %bb.0:
 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; SKX_32-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; SKX_32-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; SKX_32-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; SKX_32-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
-; SKX_32-NEXT:    kxnorw %k0, %k0, %k2
-; SKX_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm2 {%k2}
-; SKX_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; SKX_32-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
 ; SKX_32-NEXT:    retl
   %zext_ind = zext <16 x i8> %ind to <16 x i64>
@@ -2756,32 +2732,36 @@ define <16 x float> @zext_i8_index(float* %base, <16 x i8> %ind) {
 define <8 x float> @zext_v8i8_index(float* %base, <8 x i8> %ind) {
 ; KNL_64-LABEL: zext_v8i8_index:
 ; KNL_64:       # %bb.0:
-; KNL_64-NEXT:    vpmovzxbq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
-; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_64-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
+; KNL_64-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; KNL_64-NEXT:    movw $255, %ax
+; KNL_64-NEXT:    kmovw %eax, %k1
+; KNL_64-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
+; KNL_64-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: zext_v8i8_index:
 ; KNL_32:       # %bb.0:
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpmovzxbq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
-; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
+; KNL_32-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; KNL_32-NEXT:    movw $255, %cx
+; KNL_32-NEXT:    kmovw %ecx, %k1
+; KNL_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
+; KNL_32-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: zext_v8i8_index:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovzxbq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; SKX-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
-; SKX-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
+; SKX-NEXT:    vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: zext_v8i8_index:
 ; SKX_32:       # %bb.0:
 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT:    vpmovzxbq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; SKX_32-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
-; SKX_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
+; SKX_32-NEXT:    vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
 ; SKX_32-NEXT:    retl
   %zext_ind = zext <8 x i8> %ind to <8 x i64>
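For reference, a minimal IR sketch of the pattern the new combine narrows (the function name @zext_v8i8_index_sketch is hypothetical; it mirrors the zext_v8i8_index test above). Because the index is zero extended from i8, every element fits in 8 bits, so ComputeNumSignBits on the v8i64 index exceeds 64 - 32 and the combine truncates the index to v8i32 before type legalization, which is what lets instruction selection emit a single dword-indexed vgatherdps instead of qword-indexed gathers over wider index vectors.

define <8 x float> @zext_v8i8_index_sketch(float* %base, <8 x i8> %ind) {
  ; Each index element is at most 255, so the zero-extended v8i64 index has
  ; far more than 32 sign bits and can be shrunk to v8i32 by the combine.
  %zext_ind = zext <8 x i8> %ind to <8 x i64>
  %gep = getelementptr float, float* %base, <8 x i64> %zext_ind
  %res = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %gep, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef)
  ret <8 x float> %res
}

declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <8 x i1>, <8 x float>)

Under the old code, a zero extend was only removed when its source was exactly 32 bits wide and DAG.SignBitIsZero proved the top bit clear, so the v8i8 case above was still selected as a qword-indexed gather, as the removed CHECK lines show.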