 llvm/lib/Target/X86/X86ISelLowering.cpp        | 113
 llvm/test/CodeGen/X86/masked_gather_scatter.ll | 171
 2 files changed, 120 insertions(+), 164 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d93ec99ed89..44815757515 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -24385,47 +24385,32 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
   }
   MVT IndexVT = Index.getSimpleValueType();
+  MVT MaskVT = Mask.getSimpleValueType();
 
   // If the index is v2i32, we're being called by type legalization and we
   // should just let the default handling take care of it.
   if (IndexVT == MVT::v2i32)
     return SDValue();
 
-  unsigned NumElts = VT.getVectorNumElements();
+  // If we don't have VLX and neither the passthru or index is 512-bits, we
+  // need to widen until one is.
   if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
       !Index.getSimpleValueType().is512BitVector()) {
-    // AVX512F supports only 512-bit vectors. Or data or index should
-    // be 512 bit wide. If now the both index and data are 256-bit, but
-    // the vector contains 8 elements, we just sign-extend the index
-    if (IndexVT == MVT::v8i32)
-      // Just extend index
-      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
-    else {
-      // The minimal number of elts in scatter is 8
-      NumElts = 8;
-      // Index
-      MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
-      // Use original index here, do not modify the index twice
-      Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
-      if (IndexVT.getScalarType() == MVT::i32)
-        Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
-
-      // Mask
-      // At this point we have promoted mask operand
-      assert(Mask.getValueType().getScalarType() == MVT::i1 &&
-             "unexpected mask type");
-      MVT ExtMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
-      // Use the original mask here, do not modify the mask twice
-      Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
-
-      // The value that should be stored
-      MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
-      Src = ExtendToType(Src, NewVT, DAG);
-    }
-  }
-
-  // The mask is killed by scatter, add it to the values
-  SDVTList VTs = DAG.getVTList(Mask.getValueType(), MVT::Other);
+    // Determine how much we need to widen by to get a 512-bit type.
+    unsigned Factor = std::min(512/VT.getSizeInBits(),
+                               512/IndexVT.getSizeInBits());
+    unsigned NumElts = VT.getVectorNumElements() * Factor;
+
+    VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
+    IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
+    MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+
+    Src = ExtendToType(Src, VT, DAG);
+    Index = ExtendToType(Index, IndexVT, DAG);
+    Mask = ExtendToType(Mask, MaskVT, DAG, true);
+  }
+
+  SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
   SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
   SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
       VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
@@ -24532,68 +24517,46 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
   MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
   SDLoc dl(Op);
   MVT VT = Op.getSimpleValueType();
-  SDValue Scale = N->getScale();
   SDValue Index = N->getIndex();
   SDValue Mask = N->getMask();
   SDValue Src0 = N->getValue();
   MVT IndexVT = Index.getSimpleValueType();
   MVT MaskVT = Mask.getSimpleValueType();
-  unsigned NumElts = VT.getVectorNumElements();
   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
 
   // If the index is v2i32, we're being called by type legalization.
   if (IndexVT == MVT::v2i32)
     return SDValue();
 
+  // If we don't have VLX and neither the passthru or index is 512-bits, we
+  // need to widen until one is.
+  MVT OrigVT = VT;
   if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
-      !Index.getSimpleValueType().is512BitVector()) {
-    // AVX512F supports only 512-bit vectors. Or data or index should
-    // be 512 bit wide. If now the both index and data are 256-bit, but
-    // the vector contains 8 elements, we just sign-extend the index
-    if (NumElts == 8) {
-      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
-      SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index,
-                        Scale };
-      SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
-          DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
-          N->getMemOperand());
-      return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
-    }
-
-    // Minimal number of elements in Gather
-    NumElts = 8;
-    // Index
-    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
-    Index = ExtendToType(Index, NewIndexVT, DAG);
-    if (IndexVT.getScalarType() == MVT::i32)
-      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
-
-    // Mask
-    assert(MaskVT.getScalarType() == MVT::i1 && "unexpected mask type");
-    MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
-    Mask = ExtendToType(Mask, MaskVT, DAG, true);
+      !IndexVT.is512BitVector()) {
+    // Determine how much we need to widen by to get a 512-bit type.
+    unsigned Factor = std::min(512/VT.getSizeInBits(),
+                               512/IndexVT.getSizeInBits());
+
+    unsigned NumElts = VT.getVectorNumElements() * Factor;
 
-    // The pass-through value
-    MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
-    Src0 = ExtendToType(Src0, NewVT, DAG);
+    VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
+    IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
+    MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
 
-    SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index, Scale };
-    SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
-        DAG.getVTList(NewVT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
-        N->getMemOperand());
-    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
-                                  NewGather.getValue(0),
-                                  DAG.getIntPtrConstant(0, dl));
-    SDValue RetOps[] = {Extract, NewGather.getValue(2)};
-    return DAG.getMergeValues(RetOps, dl);
+    Src0 = ExtendToType(Src0, VT, DAG);
+    Index = ExtendToType(Index, IndexVT, DAG);
+    Mask = ExtendToType(Mask, MaskVT, DAG, true);
   }
 
-  SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index, Scale };
+  SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index,
+                    N->getScale() };
   SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
       DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
       N->getMemOperand());
-  return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
+  SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
+                                NewGather, DAG.getIntPtrConstant(0, dl));
+  return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl);
 }
 
 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 723b6e95741..941fdc6c15b 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -299,12 +299,14 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
 ;
 ; KNL_32-LABEL: test6:
 ; KNL_32:       # %bb.0:
-; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_32-NEXT:    vpmovsxdq %ymm1, %zmm2
-; KNL_32-NEXT:    kxnorw %k0, %k0, %k2
-; KNL_32-NEXT:    vpgatherqd (,%zmm2), %ymm1 {%k2}
-; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm2) {%k1}
-; KNL_32-NEXT:    vmovdqa %ymm1, %ymm0
+; KNL_32-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
+; KNL_32-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
+; KNL_32-NEXT:    movw $255, %ax
+; KNL_32-NEXT:    kmovw %eax, %k1
+; KNL_32-NEXT:    kmovw %k1, %k2
+; KNL_32-NEXT:    vpgatherdd (,%zmm1), %zmm2 {%k2}
+; KNL_32-NEXT:    vpscatterdd %zmm0, (,%zmm1) {%k1}
+; KNL_32-NEXT:    vmovdqa %ymm2, %ymm0
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test6:
@@ -335,25 +337,29 @@ define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
 ;
 ; KNL_64-LABEL: test7:
 ; KNL_64:       # %bb.0:
-; KNL_64-NEXT:    kmovw %esi, %k1
-; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
+; KNL_64-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
+; KNL_64-NEXT:    kmovw %esi, %k0
+; KNL_64-NEXT:    kshiftlw $8, %k0, %k0
+; KNL_64-NEXT:    kshiftrw $8, %k0, %k1
 ; KNL_64-NEXT:    kmovw %k1, %k2
-; KNL_64-NEXT:    vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2}
-; KNL_64-NEXT:    vmovdqa %ymm1, %ymm2
-; KNL_64-NEXT:    vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
+; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
+; KNL_64-NEXT:    vmovdqa64 %zmm1, %zmm2
+; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
 ; KNL_64-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test7:
 ; KNL_32:       # %bb.0:
+; KNL_32-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; KNL_32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; KNL_32-NEXT:    kmovw %ecx, %k1
-; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
+; KNL_32-NEXT:    kmovw %ecx, %k0
+; KNL_32-NEXT:    kshiftlw $8, %k0, %k0
+; KNL_32-NEXT:    kshiftrw $8, %k0, %k1
 ; KNL_32-NEXT:    kmovw %k1, %k2
-; KNL_32-NEXT:    vpgatherqd (%eax,%zmm0,4), %ymm1 {%k2}
-; KNL_32-NEXT:    vmovdqa %ymm1, %ymm2
-; KNL_32-NEXT:    vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
+; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
+; KNL_32-NEXT:    vmovdqa64 %zmm1, %zmm2
+; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
 ; KNL_32-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
 ; KNL_32-NEXT:    retl
 ;
@@ -486,10 +492,11 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
 ; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; KNL_32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [68,68,68,68,68,68,68,68]
 ; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; KNL_32-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
-; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
-; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_32-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
+; KNL_32-NEXT:    vpaddd %ymm0, %ymm2, %ymm1
+; KNL_32-NEXT:    movw $255, %ax
+; KNL_32-NEXT:    kmovw %eax, %k1
+; KNL_32-NEXT:    vpgatherdd (,%zmm1), %zmm0 {%k1}
+; KNL_32-NEXT:    # kill: def %ymm0 killed %ymm0 killed %zmm0
 ; KNL_32-NEXT:    retl
 ;
 ; SKX_SMALL-LABEL: test9:
@@ -571,10 +578,11 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
 ; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; KNL_32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [68,68,68,68,68,68,68,68]
 ; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; KNL_32-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
-; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
-; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_32-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
+; KNL_32-NEXT:    vpaddd %ymm0, %ymm2, %ymm1
+; KNL_32-NEXT:    movw $255, %ax
+; KNL_32-NEXT:    kmovw %eax, %k1
+; KNL_32-NEXT:    vpgatherdd (,%zmm1), %zmm0 {%k1}
+; KNL_32-NEXT:    # kill: def %ymm0 killed %ymm0 killed %zmm0
 ; KNL_32-NEXT:    retl
 ;
 ; SKX_SMALL-LABEL: test10:
@@ -811,28 +819,26 @@ declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*>, i32, <2 x
 define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
 ; KNL_64-LABEL: test15:
 ; KNL_64:       # %bb.0:
-; KNL_64-NEXT:    # kill: def %xmm0 killed %xmm0 def %ymm0
+; KNL_64-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; KNL_64-NEXT:    vpslld $31, %xmm1, %xmm1
 ; KNL_64-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; KNL_64-NEXT:    kshiftlw $12, %k0, %k0
 ; KNL_64-NEXT:    kshiftrw $12, %k0, %k1
-; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm1
-; KNL_64-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; KNL_64-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0
+; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; KNL_64-NEXT:    vmovaps %xmm1, %xmm0
 ; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test15:
 ; KNL_32:       # %bb.0:
-; KNL_32-NEXT:    # kill: def %xmm0 killed %xmm0 def %ymm0
+; KNL_32-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; KNL_32-NEXT:    vpslld $31, %xmm1, %xmm1
 ; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; KNL_32-NEXT:    kshiftlw $12, %k0, %k0
 ; KNL_32-NEXT:    kshiftrw $12, %k0, %k1
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
-; KNL_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; KNL_32-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0
+; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
+; KNL_32-NEXT:    vmovaps %xmm1, %xmm0
 ; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
@@ -869,8 +875,7 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x
 ; KNL_64-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; KNL_64-NEXT:    kshiftlw $12, %k0, %k0
 ; KNL_64-NEXT:    kshiftrw $12, %k0, %k1
-; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
-; KNL_64-NEXT:    vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
+; KNL_64-NEXT:    vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1}
 ; KNL_64-NEXT:    vmovapd %ymm2, %ymm0
 ; KNL_64-NEXT:    retq
 ;
@@ -883,8 +888,7 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x
 ; KNL_32-NEXT:    kshiftlw $12, %k0, %k0
 ; KNL_32-NEXT:    kshiftrw $12, %k0, %k1
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
-; KNL_32-NEXT:    vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
+; KNL_32-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm2 {%k1}
 ; KNL_32-NEXT:    vmovapd %ymm2, %ymm0
 ; KNL_32-NEXT:    retl
 ;
@@ -989,14 +993,13 @@ define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
 ;
 ; KNL_32-LABEL: test18:
 ; KNL_32:       # %bb.0:
-; KNL_32-NEXT:    # kill: def %xmm1 killed %xmm1 def %ymm1
-; KNL_32-NEXT:    # kill: def %xmm0 killed %xmm0 def %ymm0
+; KNL_32-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
+; KNL_32-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; KNL_32-NEXT:    vpslld $31, %xmm2, %xmm2
 ; KNL_32-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; KNL_32-NEXT:    kshiftlw $12, %k0, %k0
 ; KNL_32-NEXT:    kshiftrw $12, %k0, %k1
-; KNL_32-NEXT:    vpmovsxdq %ymm1, %zmm1
-; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT:    vpscatterdd %zmm0, (,%zmm1) {%k1}
 ; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
@@ -1081,14 +1084,13 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
 ;
 ; KNL_32-LABEL: test20:
 ; KNL_32:       # %bb.0:
-; KNL_32-NEXT:    # kill: def %xmm0 killed %xmm0 def %ymm0
+; KNL_32-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
+; KNL_32-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; KNL_32-NEXT:    vpsllq $63, %xmm2, %xmm2
 ; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k0
 ; KNL_32-NEXT:    kshiftlw $14, %k0, %k0
 ; KNL_32-NEXT:    kshiftrw $14, %k0, %k1
-; KNL_32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; KNL_32-NEXT:    vpmovsxdq %ymm1, %zmm1
-; KNL_32-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT:    vscatterdps %zmm0, (,%zmm1) {%k1}
 ; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
@@ -1130,10 +1132,9 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
 ; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k0
 ; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; KNL_32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; KNL_32-NEXT:    vpmovsxdq %ymm1, %zmm1
 ; KNL_32-NEXT:    kshiftlw $14, %k0, %k0
 ; KNL_32-NEXT:    kshiftrw $14, %k0, %k1
-; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT:    vpscatterdd %zmm0, (,%zmm1) {%k1}
 ; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
@@ -1163,29 +1164,27 @@ declare <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*>, i32, <2 x i1
 define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
 ; KNL_64-LABEL: test22:
 ; KNL_64:       # %bb.0:
-; KNL_64-NEXT:    # kill: def %xmm2 killed %xmm2 def %ymm2
+; KNL_64-NEXT:    # kill: def %xmm2 killed %xmm2 def %zmm2
+; KNL_64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; KNL_64-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k0
 ; KNL_64-NEXT:    kshiftlw $14, %k0, %k0
 ; KNL_64-NEXT:    kshiftrw $14, %k0, %k1
-; KNL_64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
-; KNL_64-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
+; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
 ; KNL_64-NEXT:    vmovaps %xmm2, %xmm0
 ; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test22:
 ; KNL_32:       # %bb.0:
-; KNL_32-NEXT:    # kill: def %xmm2 killed %xmm2 def %ymm2
+; KNL_32-NEXT:    # kill: def %xmm2 killed %xmm2 def %zmm2
+; KNL_32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; KNL_32-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k0
 ; KNL_32-NEXT:    kshiftlw $14, %k0, %k0
 ; KNL_32-NEXT:    kshiftrw $14, %k0, %k1
-; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
-; KNL_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
+; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm2 {%k1}
 ; KNL_32-NEXT:    vmovaps %xmm2, %xmm0
 ; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
@@ -1271,12 +1270,11 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %
 ; KNL_64:       # %bb.0:
 ; KNL_64-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k0
-; KNL_64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
 ; KNL_64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
+; KNL_64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
 ; KNL_64-NEXT:    kshiftlw $14, %k0, %k0
 ; KNL_64-NEXT:    kshiftrw $14, %k0, %k1
-; KNL_64-NEXT:    vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k1}
+; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
 ; KNL_64-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
 ; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
@@ -1286,12 +1284,11 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %
 ; KNL_32-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k0
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
 ; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
+; KNL_32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
 ; KNL_32-NEXT:    kshiftlw $14, %k0, %k0
 ; KNL_32-NEXT:    kshiftrw $14, %k0, %k1
-; KNL_32-NEXT:    vpgatherqd (%eax,%zmm0,4), %ymm1 {%k1}
+; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
 ; KNL_32-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
 ; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
@@ -1377,10 +1374,9 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
 ; KNL_64-LABEL: test24:
 ; KNL_64:       # %bb.0:
 ; KNL_64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
-; KNL_64-NEXT:    movb $3, %al
+; KNL_64-NEXT:    movw $3, %ax
 ; KNL_64-NEXT:    kmovw %eax, %k1
-; KNL_64-NEXT:    vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k1}
+; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
 ; KNL_64-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
 ; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
@@ -1389,10 +1385,9 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
 ; KNL_32:       # %bb.0:
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
-; KNL_32-NEXT:    movb $3, %cl
+; KNL_32-NEXT:    movw $3, %cx
 ; KNL_32-NEXT:    kmovw %ecx, %k1
-; KNL_32-NEXT:    vpgatherqd (%eax,%zmm0,4), %ymm1 {%k1}
+; KNL_32-NEXT:    vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
 ; KNL_32-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
 ; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
@@ -1531,24 +1526,22 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
 define <2 x float> @test27(float* %base, <2 x i32> %ind) {
 ; KNL_64-LABEL: test27:
 ; KNL_64:       # %bb.0:
-; KNL_64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm1
-; KNL_64-NEXT:    movb $3, %al
+; KNL_64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; KNL_64-NEXT:    movw $3, %ax
 ; KNL_64-NEXT:    kmovw %eax, %k1
-; KNL_64-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; KNL_64-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0
+; KNL_64-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
+; KNL_64-NEXT:    # kill: def %xmm0 killed %xmm0 killed %zmm0
 ; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test27:
 ; KNL_32:       # %bb.0:
-; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_32-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
-; KNL_32-NEXT:    movb $3, %cl
+; KNL_32-NEXT:    movw $3, %cx
 ; KNL_32-NEXT:    kmovw %ecx, %k1
-; KNL_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; KNL_32-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0
+; KNL_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
+; KNL_32-NEXT:    # kill: def %xmm0 killed %xmm0 killed %zmm0
 ; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
@@ -1590,10 +1583,9 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
 ; KNL_32:       # %bb.0:
 ; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; KNL_32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; KNL_32-NEXT:    vpmovsxdq %ymm1, %zmm1
-; KNL_32-NEXT:    movb $3, %al
+; KNL_32-NEXT:    movw $3, %ax
 ; KNL_32-NEXT:    kmovw %eax, %k1
-; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT:    vpscatterdd %zmm0, (,%zmm1) {%k1}
 ; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
@@ -2361,8 +2353,7 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
 ; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; KNL_32-NEXT:    kshiftlw $12, %k0, %k0
 ; KNL_32-NEXT:    kshiftrw $12, %k0, %k1
-; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
-; KNL_32-NEXT:    vpgatherqq (,%zmm0), %zmm1 {%k1}
+; KNL_32-NEXT:    vpgatherdq (,%ymm0), %zmm1 {%k1}
 ; KNL_32-NEXT:    vpaddq %ymm1, %ymm1, %ymm0
 ; KNL_32-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; KNL_32-NEXT:    movl %ebp, %esp
@@ -2628,11 +2619,12 @@ define <8 x float> @sext_v8i8_index(float* %base, <8 x i8> %ind) {
 ; KNL_64-LABEL: sext_v8i8_index:
 ; KNL_64:       # %bb.0:
 ; KNL_64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
 ; KNL_64-NEXT:    vpslld $24, %ymm0, %ymm0
-; KNL_64-NEXT:    vpsrad $24, %ymm0, %ymm0
-; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm1
-; KNL_64-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
+; KNL_64-NEXT:    vpsrad $24, %ymm0, %ymm1
+; KNL_64-NEXT:    movw $255, %ax
+; KNL_64-NEXT:    kmovw %eax, %k1
+; KNL_64-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
+; KNL_64-NEXT:    # kill: def %ymm0 killed %ymm0 killed %zmm0
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: sext_v8i8_index:
@@ -2640,10 +2632,11 @@ define <8 x float> @sext_v8i8_index(float* %base, <8 x i8> %ind) {
 ; KNL_32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; KNL_32-NEXT:    vpslld $24, %ymm0, %ymm0
-; KNL_32-NEXT:    vpsrad $24, %ymm0, %ymm0
-; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
-; KNL_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
+; KNL_32-NEXT:    vpsrad $24, %ymm0, %ymm1
+; KNL_32-NEXT:    movw $255, %cx
+; KNL_32-NEXT:    kmovw %ecx, %k1
+; KNL_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
+; KNL_32-NEXT:    # kill: def %ymm0 killed %ymm0 killed %zmm0
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: sext_v8i8_index:
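
The change common to both LowerMSCATTER and LowerMGATHER above is the widening rule: instead of sign-extending a sub-512-bit index to v8i64 and emitting a qword-indexed gather/scatter, the lowering now widens the data, index, and mask vectors by the smallest factor that makes at least one operand 512 bits, emits the 512-bit operation, and (for gathers) extracts the original subvector from the result. The self-contained C++ sketch below reproduces only that factor arithmetic; SimpleVT is a hypothetical stand-in for LLVM's MVT so the example compiles outside the tree.

#include <algorithm>
#include <cstdio>

// Hypothetical stand-in for MVT: just enough to model vector widths.
struct SimpleVT {
  unsigned EltBits;  // element width in bits
  unsigned NumElts;  // element count
  unsigned sizeInBits() const { return EltBits * NumElts; }
};

int main() {
  // test16 above: v4f64 passthru (256 bits) with a v4i32 index (128 bits),
  // so neither operand is 512 bits wide and the widening path is taken.
  SimpleVT VT{64, 4}, IndexVT{32, 4};

  // Mirrors the patch:
  //   unsigned Factor = std::min(512/VT.getSizeInBits(),
  //                              512/IndexVT.getSizeInBits());
  unsigned Factor = std::min(512 / VT.sizeInBits(), 512 / IndexVT.sizeInBits());
  unsigned NumElts = VT.NumElts * Factor;

  std::printf("Factor=%u -> v%uf64 data, v%ui32 index, v%ui1 mask\n",
              Factor, NumElts, NumElts, NumElts);
  // Prints: Factor=2 -> v8f64 data, v8i32 index, v8i1 mask.
  // The widened v8f64 operand is 512 bits, so plain AVX512F can select
  // vgatherdpd with a ymm index register, and the result is shrunk back
  // to the original v4f64 via EXTRACT_SUBVECTOR (OrigVT in the patch).
  return 0;
}

The same arithmetic explains the test updates: for the v8i32/v8f32 cases (test6, test7, test9, test10, sext_v8i8_index) the factor is 2, giving v16i32/v16f32 operands and dword-indexed zmm instructions (vpgatherdd, vpscatterdd, vgatherdps) under an explicit movw $255 mask covering the live low 8 lanes, in place of the old kxnorw all-ones mask plus vpmovsxdq index extension.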