-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp    |  53
-rw-r--r--  llvm/test/CodeGen/X86/shrink_vmul-widen.ll | 147
-rw-r--r--  llvm/test/CodeGen/X86/vector-idiv-v2i32.ll | 276
-rw-r--r--  llvm/test/CodeGen/X86/widen_cast-4.ll      |   6
4 files changed, 239 insertions, 243 deletions
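
For context: the patch teaches the X86 backend to custom-lower 64-bit vector loads and stores (v2i32, v4i16, v8i8) the same way it already handled v2f32, i.e. through a single 64-bit memory access instead of scalarized 32-bit accesses, when the widening vector-legalization mode exercised by the -widen.ll tests is in effect. Below is a minimal illustrative IR function of the kind affected; the function name is invented and the assembly comments paraphrase the removed/added CHECK lines from the tests, assuming a 32-bit SSE2 target.

; Illustrative only (not from the patch): a <2 x i32> round trip that
; exercises both the new custom LOAD and STORE lowering.
define void @inc_v2i32(<2 x i32>* %p) nounwind {
  %a = load <2 x i32>, <2 x i32>* %p
  %b = add <2 x i32> %a, <i32 1, i32 1>
  store <2 x i32> %b, <2 x i32>* %p
  ret void
}
; Before this change the store side scalarized into two 32-bit stores:
;   movd   %xmm0, (%eax)
;   pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
;   movd   %xmm0, 4(%eax)
; After it, the load and the store each become a single 64-bit access:
;   movq   {{.*#+}} xmm0 = mem[0],zero
;   movq   %xmm0, (%eax)
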
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 9f8359322bd..209fb5885f7 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -905,7 +905,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for // store. setOperationAction(ISD::LOAD, MVT::v2f32, Custom); + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i16, Custom); + setOperationAction(ISD::LOAD, MVT::v8i8, Custom); setOperationAction(ISD::STORE, MVT::v2f32, Custom); + setOperationAction(ISD::STORE, MVT::v2i32, Custom); + setOperationAction(ISD::STORE, MVT::v4i16, Custom); + setOperationAction(ISD::STORE, MVT::v8i8, Custom); setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); @@ -20073,14 +20079,24 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, if (St->isTruncatingStore()) return SDValue(); - assert(StoredVal.getValueType() == MVT::v2f32 && "Unexpected VT"); - - // Widen the vector, cast to a v2x64 type, extract the single 64-bit - // element and store it. - StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, StoredVal, - DAG.getUNDEF(MVT::v2f32)); - StoredVal = DAG.getBitcast(MVT::v2f64, StoredVal); - StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, StoredVal, + MVT StoreVT = StoredVal.getSimpleValueType(); + assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 && + "Unexpected VT"); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (DAG.getTargetLoweringInfo().getTypeAction(*DAG.getContext(), StoreVT) != + TargetLowering::TypeWidenVector) + return SDValue(); + + // Widen the vector, cast to a v2x64 type, extract the single 64-bit element + // and store it. + MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(), + StoreVT.getVectorNumElements() * 2); + StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal, + DAG.getUNDEF(StoreVT)); + MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64; + MVT CastVT = MVT::getVectorVT(StVT, 2); + StoredVal = DAG.getBitcast(CastVT, StoredVal); + StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal, DAG.getIntPtrConstant(0, dl)); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), @@ -26567,20 +26583,27 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, break; } case ISD::LOAD: { - // Use an f64 load and a scalar_to_vector for v2f32 loads. This avoids - // scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp cast - // since type legalization will try to use an i64 load. - assert(N->getValueType(0) == MVT::v2f32 && "Unexpected VT"); + // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This + // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp + // cast since type legalization will try to use an i64 load. + MVT VT = N->getSimpleValueType(0); + assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT"); + if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) + return; if (!ISD::isNON_EXTLoad(N)) return; auto *Ld = cast<LoadSDNode>(N); - SDValue Res = DAG.getLoad(MVT::f64, dl, Ld->getChain(), Ld->getBasePtr(), + MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? 
MVT::i64 : MVT::f64; + SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); SDValue Chain = Res.getValue(1); - Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Res); - Res = DAG.getBitcast(MVT::v4f32, Res); + MVT WideVT = MVT::getVectorVT(LdVT, 2); + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res); + MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), + VT.getVectorNumElements() * 2); + Res = DAG.getBitcast(CastVT, Res); Results.push_back(Res); Results.push_back(Chain); return; diff --git a/llvm/test/CodeGen/X86/shrink_vmul-widen.ll b/llvm/test/CodeGen/X86/shrink_vmul-widen.ll index 73be2a2f5de..b1278738ee5 100644 --- a/llvm/test/CodeGen/X86/shrink_vmul-widen.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul-widen.ll @@ -31,9 +31,7 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; X86-SSE-NEXT: pmullw %xmm0, %xmm1 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: movd %xmm1, (%esi,%ecx,4) -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE-NEXT: movd %xmm0, 4(%esi,%ecx,4) +; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -54,8 +52,7 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X86-AVX-NEXT: vmovd %eax, %xmm1 ; X86-AVX-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%esi,%ecx,4) -; X86-AVX-NEXT: vmovd %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: retl @@ -206,10 +203,10 @@ define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; X86-SSE-NEXT: pmullw %xmm0, %xmm1 ; X86-SSE-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4) +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -485,9 +482,7 @@ define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: movd %xmm1, (%esi,%ecx,4) -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE-NEXT: movd %xmm0, 4(%esi,%ecx,4) +; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -503,8 +498,7 @@ define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; X86-AVX-NEXT: vmovd 
{{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%esi,%ecx,4) -; X86-AVX-NEXT: vmovd %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl ; @@ -923,9 +917,7 @@ define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, ; X86-SSE-NEXT: pmullw %xmm0, %xmm1 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X86-SSE-NEXT: psrad $16, %xmm0 -; X86-SSE-NEXT: movd %xmm0, (%esi,%ecx,4) -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86-SSE-NEXT: movd %xmm0, 4(%esi,%ecx,4) +; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -946,8 +938,7 @@ define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, ; X86-AVX-NEXT: vmovd %eax, %xmm1 ; X86-AVX-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%esi,%ecx,4) -; X86-AVX-NEXT: vmovd %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: retl @@ -1026,9 +1017,7 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl ; X86-SSE-NEXT: pmulhw %xmm0, %xmm2 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: movd %xmm0, (%esi,%ecx,4) -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86-SSE-NEXT: movd %xmm0, 4(%esi,%ecx,4) +; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -1049,8 +1038,7 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl ; X86-AVX-NEXT: vmovd %eax, %xmm1 ; X86-AVX-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%esi,%ecx,4) -; X86-AVX-NEXT: vmovd %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: retl @@ -1124,9 +1112,7 @@ define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b ; X86-SSE-NEXT: pmulhw %xmm0, %xmm2 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: movd %xmm1, (%esi,%ecx,4) -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE-NEXT: movd %xmm0, 4(%esi,%ecx,4) +; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -1147,8 +1133,7 @@ define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b ; X86-AVX-NEXT: vmovd %eax, %xmm1 ; X86-AVX-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%esi,%ecx,4) -; X86-AVX-NEXT: vmovd %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: retl @@ -1217,11 +1202,12 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon ; X86-SSE-NEXT: pxor %xmm2, %xmm2 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 
= xmm0[1,1,3,3] -; X86-SSE-NEXT: pmuludq %xmm2, %xmm3 ; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 -; X86-SSE-NEXT: movd %xmm1, (%esi,%ecx,4) -; X86-SSE-NEXT: movd %xmm3, 4(%esi,%ecx,4) +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -1240,8 +1226,7 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon ; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%esi,%ecx,4) -; X86-AVX-NEXT: vmovd %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: retl @@ -1470,9 +1455,7 @@ define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) { ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4) +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi8_varconst1: @@ -1488,8 +1471,7 @@ define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) { ; X86-AVX-NEXT: vmovd %ecx, %xmm0 ; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) -; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX-NEXT: retl @@ -1547,9 +1529,7 @@ define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) { ; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; X86-SSE-NEXT: psrad $16, %xmm0 -; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4) +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi8_varconst2: @@ -1565,8 +1545,7 @@ define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) { ; X86-AVX-NEXT: vmovd %ecx, %xmm0 ; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) -; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX-NEXT: retl @@ -1627,9 +1606,7 @@ define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) { ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4) +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: 
mul_2xi8_varconst3: @@ -1645,8 +1622,7 @@ define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) { ; X86-AVX-NEXT: vmovd %ecx, %xmm0 ; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) -; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX-NEXT: retl @@ -1709,9 +1685,7 @@ define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) { ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4) +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi8_varconst4: @@ -1727,8 +1701,7 @@ define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) { ; X86-AVX-NEXT: vmovd %ecx, %xmm0 ; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) -; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX-NEXT: retl @@ -1791,9 +1764,7 @@ define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) { ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4) +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi8_varconst5: @@ -1809,8 +1780,7 @@ define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) { ; X86-AVX-NEXT: vmovd %ecx, %xmm0 ; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) -; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX-NEXT: retl @@ -1873,9 +1843,7 @@ define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) { ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4) +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi8_varconst6: @@ -1891,8 +1859,7 @@ define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) { ; X86-AVX-NEXT: vmovd %ecx, %xmm0 ; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) -; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX-NEXT: retl @@ -1952,9 +1919,7 @@ define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) { ; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0 ; 
X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4) +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi16_varconst1: @@ -1965,8 +1930,7 @@ define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) { ; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) -; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi16_varconst1: @@ -2019,9 +1983,7 @@ define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) { ; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86-SSE-NEXT: movd %xmm0, 4(%edx,%eax,4) +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi16_varconst2: @@ -2037,8 +1999,7 @@ define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) { ; X86-AVX-NEXT: vmovd %ecx, %xmm0 ; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) -; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX-NEXT: retl @@ -2092,13 +2053,14 @@ define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) { ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: pxor %xmm1, %xmm1 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = <0,65536,u,u> -; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; X86-SSE-NEXT: pmuludq %xmm1, %xmm3 -; X86-SSE-NEXT: pmuludq %xmm2, %xmm0 -; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) -; X86-SSE-NEXT: movd %xmm3, 4(%edx,%eax,4) +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65536,u,u> +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi16_varconst3: @@ -2109,8 +2071,7 @@ define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) { ; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) -; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: mul_2xi16_varconst3: @@ -2164,13 +2125,14 @@ define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) { ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = 
mem[0],zero,zero,zero ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $16, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = <0,32768,u,u> -; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; X86-SSE-NEXT: pmuludq %xmm1, %xmm3 -; X86-SSE-NEXT: pmuludq %xmm2, %xmm0 -; X86-SSE-NEXT: movd %xmm0, (%edx,%eax,4) -; X86-SSE-NEXT: movd %xmm3, 4(%edx,%eax,4) +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,32768,u,u> +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; ; X86-AVX-LABEL: mul_2xi16_varconst4: @@ -2186,8 +2148,7 @@ define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) { ; X86-AVX-NEXT: vmovd %ecx, %xmm0 ; X86-AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vpextrd $1, %xmm0, 4(%edx,%eax,4) -; X86-AVX-NEXT: vmovd %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX-NEXT: retl diff --git a/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll b/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll index 2053be4df85..dc898e2bdec 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll @@ -88,31 +88,33 @@ define void @test_udiv7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind { ; ; X86_WIDEN-LABEL: test_udiv7_v2i32: ; X86_WIDEN: # %bb.0: -; X86_WIDEN-NEXT: pushl %ebx ; X86_WIDEN-NEXT: pushl %edi ; X86_WIDEN-NEXT: pushl %esi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86_WIDEN-NEXT: movl (%eax), %ecx -; X86_WIDEN-NEXT: movl 4(%eax), %esi -; X86_WIDEN-NEXT: movl $613566757, %ebx # imm = 0x24924925 +; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86_WIDEN-NEXT: movd %xmm0, %ecx +; X86_WIDEN-NEXT: movl $613566757, %edi # imm = 0x24924925 ; X86_WIDEN-NEXT: movl %ecx, %eax -; X86_WIDEN-NEXT: mull %ebx +; X86_WIDEN-NEXT: mull %edi ; X86_WIDEN-NEXT: subl %edx, %ecx ; X86_WIDEN-NEXT: shrl %ecx ; X86_WIDEN-NEXT: addl %edx, %ecx ; X86_WIDEN-NEXT: shrl $2, %ecx -; X86_WIDEN-NEXT: movl %esi, %eax -; X86_WIDEN-NEXT: mull %ebx -; X86_WIDEN-NEXT: subl %edx, %esi -; X86_WIDEN-NEXT: shrl %esi -; X86_WIDEN-NEXT: addl %edx, %esi -; X86_WIDEN-NEXT: shrl $2, %esi -; X86_WIDEN-NEXT: movl %esi, 4(%edi) -; X86_WIDEN-NEXT: movl %ecx, (%edi) +; X86_WIDEN-NEXT: movd %ecx, %xmm1 +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86_WIDEN-NEXT: movd %xmm0, %ecx +; X86_WIDEN-NEXT: movl %ecx, %eax +; X86_WIDEN-NEXT: mull %edi +; X86_WIDEN-NEXT: subl %edx, %ecx +; X86_WIDEN-NEXT: shrl %ecx +; X86_WIDEN-NEXT: addl %edx, %ecx +; X86_WIDEN-NEXT: shrl $2, %ecx +; X86_WIDEN-NEXT: movd %ecx, %xmm0 +; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86_WIDEN-NEXT: movq %xmm1, (%esi) ; X86_WIDEN-NEXT: popl %esi ; X86_WIDEN-NEXT: popl %edi -; X86_WIDEN-NEXT: popl %ebx ; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = udiv <2 x i32> %a, <i32 7, i32 7> @@ -230,27 +232,28 @@ define void @test_urem7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind { ; ; X86_WIDEN-LABEL: test_urem7_v2i32: ; 
X86_WIDEN: # %bb.0: -; X86_WIDEN-NEXT: pushl %ebp -; X86_WIDEN-NEXT: pushl %ebx ; X86_WIDEN-NEXT: pushl %edi ; X86_WIDEN-NEXT: pushl %esi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86_WIDEN-NEXT: movl (%eax), %esi -; X86_WIDEN-NEXT: movl 4(%eax), %ecx -; X86_WIDEN-NEXT: movl $613566757, %ebx # imm = 0x24924925 -; X86_WIDEN-NEXT: movl %esi, %eax -; X86_WIDEN-NEXT: mull %ebx -; X86_WIDEN-NEXT: movl %esi, %ebp -; X86_WIDEN-NEXT: subl %edx, %ebp -; X86_WIDEN-NEXT: shrl %ebp -; X86_WIDEN-NEXT: addl %edx, %ebp -; X86_WIDEN-NEXT: shrl $2, %ebp -; X86_WIDEN-NEXT: leal (,%ebp,8), %eax -; X86_WIDEN-NEXT: subl %eax, %ebp -; X86_WIDEN-NEXT: addl %esi, %ebp +; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86_WIDEN-NEXT: movd %xmm0, %ecx +; X86_WIDEN-NEXT: movl $613566757, %edi # imm = 0x24924925 +; X86_WIDEN-NEXT: movl %ecx, %eax +; X86_WIDEN-NEXT: mull %edi +; X86_WIDEN-NEXT: movl %ecx, %eax +; X86_WIDEN-NEXT: subl %edx, %eax +; X86_WIDEN-NEXT: shrl %eax +; X86_WIDEN-NEXT: addl %edx, %eax +; X86_WIDEN-NEXT: shrl $2, %eax +; X86_WIDEN-NEXT: leal (,%eax,8), %edx +; X86_WIDEN-NEXT: subl %edx, %eax +; X86_WIDEN-NEXT: addl %ecx, %eax +; X86_WIDEN-NEXT: movd %eax, %xmm1 +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86_WIDEN-NEXT: movd %xmm0, %ecx ; X86_WIDEN-NEXT: movl %ecx, %eax -; X86_WIDEN-NEXT: mull %ebx +; X86_WIDEN-NEXT: mull %edi ; X86_WIDEN-NEXT: movl %ecx, %eax ; X86_WIDEN-NEXT: subl %edx, %eax ; X86_WIDEN-NEXT: shrl %eax @@ -259,12 +262,11 @@ define void @test_urem7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind { ; X86_WIDEN-NEXT: leal (,%eax,8), %edx ; X86_WIDEN-NEXT: subl %edx, %eax ; X86_WIDEN-NEXT: addl %ecx, %eax -; X86_WIDEN-NEXT: movl %eax, 4(%edi) -; X86_WIDEN-NEXT: movl %ebp, (%edi) +; X86_WIDEN-NEXT: movd %eax, %xmm0 +; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86_WIDEN-NEXT: movq %xmm1, (%esi) ; X86_WIDEN-NEXT: popl %esi ; X86_WIDEN-NEXT: popl %edi -; X86_WIDEN-NEXT: popl %ebx -; X86_WIDEN-NEXT: popl %ebp ; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = urem <2 x i32> %a, <i32 7, i32 7> @@ -369,36 +371,37 @@ define void @test_sdiv7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind { ; ; X86_WIDEN-LABEL: test_sdiv7_v2i32: ; X86_WIDEN: # %bb.0: -; X86_WIDEN-NEXT: pushl %ebp ; X86_WIDEN-NEXT: pushl %ebx ; X86_WIDEN-NEXT: pushl %edi ; X86_WIDEN-NEXT: pushl %esi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86_WIDEN-NEXT: movl (%eax), %ecx -; X86_WIDEN-NEXT: movl 4(%eax), %esi -; X86_WIDEN-NEXT: movl $-1840700269, %ebp # imm = 0x92492493 -; X86_WIDEN-NEXT: movl %ecx, %eax -; X86_WIDEN-NEXT: imull %ebp -; X86_WIDEN-NEXT: movl %edx, %edi -; X86_WIDEN-NEXT: addl %ecx, %edi -; X86_WIDEN-NEXT: movl %edi, %eax -; X86_WIDEN-NEXT: shrl $31, %eax -; X86_WIDEN-NEXT: sarl $2, %edi -; X86_WIDEN-NEXT: addl %eax, %edi +; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86_WIDEN-NEXT: movd %xmm0, %ecx +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86_WIDEN-NEXT: movd %xmm0, %esi +; X86_WIDEN-NEXT: movl $-1840700269, %ebx # imm = 0x92492493 ; X86_WIDEN-NEXT: movl %esi, %eax -; X86_WIDEN-NEXT: imull %ebp +; X86_WIDEN-NEXT: imull %ebx ; X86_WIDEN-NEXT: addl %esi, %edx ; X86_WIDEN-NEXT: movl %edx, %eax ; X86_WIDEN-NEXT: shrl $31, %eax ; X86_WIDEN-NEXT: sarl $2, %edx ; X86_WIDEN-NEXT: addl %eax, %edx -; X86_WIDEN-NEXT: movl %edx, 
4(%ebx) -; X86_WIDEN-NEXT: movl %edi, (%ebx) +; X86_WIDEN-NEXT: movd %edx, %xmm0 +; X86_WIDEN-NEXT: movl %ecx, %eax +; X86_WIDEN-NEXT: imull %ebx +; X86_WIDEN-NEXT: addl %ecx, %edx +; X86_WIDEN-NEXT: movl %edx, %eax +; X86_WIDEN-NEXT: shrl $31, %eax +; X86_WIDEN-NEXT: sarl $2, %edx +; X86_WIDEN-NEXT: addl %eax, %edx +; X86_WIDEN-NEXT: movd %edx, %xmm1 +; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86_WIDEN-NEXT: movq %xmm1, (%edi) ; X86_WIDEN-NEXT: popl %esi ; X86_WIDEN-NEXT: popl %edi ; X86_WIDEN-NEXT: popl %ebx -; X86_WIDEN-NEXT: popl %ebp ; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = sdiv <2 x i32> %a, <i32 7, i32 7> @@ -521,28 +524,29 @@ define void @test_srem7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind { ; ; X86_WIDEN-LABEL: test_srem7_v2i32: ; X86_WIDEN: # %bb.0: -; X86_WIDEN-NEXT: pushl %ebp ; X86_WIDEN-NEXT: pushl %ebx ; X86_WIDEN-NEXT: pushl %edi ; X86_WIDEN-NEXT: pushl %esi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86_WIDEN-NEXT: movl (%eax), %edi -; X86_WIDEN-NEXT: movl 4(%eax), %ecx -; X86_WIDEN-NEXT: movl $-1840700269, %ebp # imm = 0x92492493 -; X86_WIDEN-NEXT: movl %edi, %eax -; X86_WIDEN-NEXT: imull %ebp -; X86_WIDEN-NEXT: movl %edx, %esi -; X86_WIDEN-NEXT: addl %edi, %esi +; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86_WIDEN-NEXT: movd %xmm0, %ecx +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86_WIDEN-NEXT: movd %xmm0, %esi +; X86_WIDEN-NEXT: movl $-1840700269, %edi # imm = 0x92492493 ; X86_WIDEN-NEXT: movl %esi, %eax +; X86_WIDEN-NEXT: imull %edi +; X86_WIDEN-NEXT: addl %esi, %edx +; X86_WIDEN-NEXT: movl %edx, %eax ; X86_WIDEN-NEXT: shrl $31, %eax -; X86_WIDEN-NEXT: sarl $2, %esi -; X86_WIDEN-NEXT: addl %eax, %esi -; X86_WIDEN-NEXT: leal (,%esi,8), %eax -; X86_WIDEN-NEXT: subl %eax, %esi -; X86_WIDEN-NEXT: addl %edi, %esi +; X86_WIDEN-NEXT: sarl $2, %edx +; X86_WIDEN-NEXT: addl %eax, %edx +; X86_WIDEN-NEXT: leal (,%edx,8), %eax +; X86_WIDEN-NEXT: subl %eax, %edx +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86_WIDEN-NEXT: addl %esi, %edx +; X86_WIDEN-NEXT: movd %edx, %xmm0 ; X86_WIDEN-NEXT: movl %ecx, %eax -; X86_WIDEN-NEXT: imull %ebp +; X86_WIDEN-NEXT: imull %edi ; X86_WIDEN-NEXT: addl %ecx, %edx ; X86_WIDEN-NEXT: movl %edx, %eax ; X86_WIDEN-NEXT: shrl $31, %eax @@ -551,12 +555,12 @@ define void @test_srem7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind { ; X86_WIDEN-NEXT: leal (,%edx,8), %eax ; X86_WIDEN-NEXT: subl %eax, %edx ; X86_WIDEN-NEXT: addl %ecx, %edx -; X86_WIDEN-NEXT: movl %edx, 4(%ebx) -; X86_WIDEN-NEXT: movl %esi, (%ebx) +; X86_WIDEN-NEXT: movd %edx, %xmm1 +; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86_WIDEN-NEXT: movq %xmm1, (%ebx) ; X86_WIDEN-NEXT: popl %esi ; X86_WIDEN-NEXT: popl %edi ; X86_WIDEN-NEXT: popl %ebx -; X86_WIDEN-NEXT: popl %ebp ; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = srem <2 x i32> %a, <i32 7, i32 7> @@ -600,9 +604,7 @@ define void @test_udiv_pow2_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind { ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86_WIDEN-NEXT: psrld $3, %xmm0 -; X86_WIDEN-NEXT: movd %xmm0, (%eax) -; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86_WIDEN-NEXT: movd %xmm0, 4(%eax) +; X86_WIDEN-NEXT: movq %xmm0, (%eax) ; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = udiv <2 x i32> %a, <i32 8, i32 8> @@ -645,11 +647,9 @@ define void @test_urem_pow2_v2i32(<2 x i32>* 
%x, <2 x i32>* %y) nounwind { ; X86_WIDEN: # %bb.0: ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86_WIDEN-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X86_WIDEN-NEXT: movd %xmm0, (%eax) -; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86_WIDEN-NEXT: movd %xmm0, 4(%eax) +; X86_WIDEN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86_WIDEN-NEXT: andps {{\.LCPI.*}}, %xmm0 +; X86_WIDEN-NEXT: movlps %xmm0, (%eax) ; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = urem <2 x i32> %a, <i32 8, i32 8> @@ -741,9 +741,7 @@ define void @test_sdiv_pow2_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind { ; X86_WIDEN-NEXT: psrld $29, %xmm1 ; X86_WIDEN-NEXT: paddd %xmm0, %xmm1 ; X86_WIDEN-NEXT: psrad $3, %xmm1 -; X86_WIDEN-NEXT: movd %xmm1, (%eax) -; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86_WIDEN-NEXT: movd %xmm0, 4(%eax) +; X86_WIDEN-NEXT: movq %xmm1, (%eax) ; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = sdiv <2 x i32> %a, <i32 8, i32 8> @@ -787,9 +785,7 @@ define void @test_srem_pow2_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind { ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X86_WIDEN-NEXT: psrld $3, %xmm0 -; X86_WIDEN-NEXT: movd %xmm0, (%eax) -; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86_WIDEN-NEXT: movd %xmm0, 4(%eax) +; X86_WIDEN-NEXT: movq %xmm0, (%eax) ; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = udiv <2 x i32> %a, <i32 8, i32 8> @@ -874,25 +870,27 @@ define void @test_udiv_v2i32(<2 x i32>* %x, <2 x i32>* %y, <2 x i32>* %z) nounwi ; ; X86_WIDEN-LABEL: test_udiv_v2i32: ; X86_WIDEN: # %bb.0: -; X86_WIDEN-NEXT: pushl %ebx -; X86_WIDEN-NEXT: pushl %edi ; X86_WIDEN-NEXT: pushl %esi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86_WIDEN-NEXT: movl (%ecx), %eax -; X86_WIDEN-NEXT: movl 4(%ecx), %ecx +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86_WIDEN-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86_WIDEN-NEXT: movd %xmm0, %eax +; X86_WIDEN-NEXT: movd %xmm1, %esi ; X86_WIDEN-NEXT: xorl %edx, %edx -; X86_WIDEN-NEXT: divl (%ebx) -; X86_WIDEN-NEXT: movl %eax, %esi -; X86_WIDEN-NEXT: movl %ecx, %eax +; X86_WIDEN-NEXT: divl %esi +; X86_WIDEN-NEXT: movd %eax, %xmm2 +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86_WIDEN-NEXT: movd %xmm0, %eax +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] +; X86_WIDEN-NEXT: movd %xmm1, %esi ; X86_WIDEN-NEXT: xorl %edx, %edx -; X86_WIDEN-NEXT: divl 4(%ebx) -; X86_WIDEN-NEXT: movl %eax, 4(%edi) -; X86_WIDEN-NEXT: movl %esi, (%edi) +; X86_WIDEN-NEXT: divl %esi +; X86_WIDEN-NEXT: movd %eax, %xmm0 +; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X86_WIDEN-NEXT: movq %xmm2, (%ecx) ; X86_WIDEN-NEXT: popl %esi -; X86_WIDEN-NEXT: popl %edi -; X86_WIDEN-NEXT: popl %ebx ; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = load <2 x i32>, <2 x i32>* %y @@ -978,25 +976,27 @@ define void @test_urem_v2i32(<2 x i32>* %x, <2 x i32>* %y, <2 x i32>* %z) nounwi ; ; X86_WIDEN-LABEL: test_urem_v2i32: ; X86_WIDEN: # %bb.0: -; X86_WIDEN-NEXT: pushl %ebx -; X86_WIDEN-NEXT: pushl %edi ; X86_WIDEN-NEXT: pushl %esi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ebx ; 
X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86_WIDEN-NEXT: movl (%ecx), %eax -; X86_WIDEN-NEXT: movl 4(%ecx), %ecx +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86_WIDEN-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86_WIDEN-NEXT: movd %xmm0, %eax +; X86_WIDEN-NEXT: movd %xmm1, %esi ; X86_WIDEN-NEXT: xorl %edx, %edx -; X86_WIDEN-NEXT: divl (%ebx) -; X86_WIDEN-NEXT: movl %edx, %esi -; X86_WIDEN-NEXT: movl %ecx, %eax +; X86_WIDEN-NEXT: divl %esi +; X86_WIDEN-NEXT: movd %edx, %xmm2 +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86_WIDEN-NEXT: movd %xmm0, %eax +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] +; X86_WIDEN-NEXT: movd %xmm1, %esi ; X86_WIDEN-NEXT: xorl %edx, %edx -; X86_WIDEN-NEXT: divl 4(%ebx) -; X86_WIDEN-NEXT: movl %edx, 4(%edi) -; X86_WIDEN-NEXT: movl %esi, (%edi) +; X86_WIDEN-NEXT: divl %esi +; X86_WIDEN-NEXT: movd %edx, %xmm0 +; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X86_WIDEN-NEXT: movq %xmm2, (%ecx) ; X86_WIDEN-NEXT: popl %esi -; X86_WIDEN-NEXT: popl %edi -; X86_WIDEN-NEXT: popl %ebx ; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = load <2 x i32>, <2 x i32>* %y @@ -1085,19 +1085,26 @@ define void @test_sdiv_v2i32(<2 x i32>* %x, <2 x i32>* %y, <2 x i32>* %z) nounwi ; X86_WIDEN-NEXT: pushl %ebx ; X86_WIDEN-NEXT: pushl %edi ; X86_WIDEN-NEXT: pushl %esi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86_WIDEN-NEXT: movl (%ecx), %eax -; X86_WIDEN-NEXT: movl 4(%ecx), %ecx +; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86_WIDEN-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86_WIDEN-NEXT: movd %xmm0, %ecx +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86_WIDEN-NEXT: movd %xmm0, %eax +; X86_WIDEN-NEXT: movd %xmm1, %edi +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] +; X86_WIDEN-NEXT: movd %xmm1, %ebx ; X86_WIDEN-NEXT: cltd -; X86_WIDEN-NEXT: idivl (%ebx) -; X86_WIDEN-NEXT: movl %eax, %esi +; X86_WIDEN-NEXT: idivl %ebx +; X86_WIDEN-NEXT: movd %eax, %xmm0 ; X86_WIDEN-NEXT: movl %ecx, %eax ; X86_WIDEN-NEXT: cltd -; X86_WIDEN-NEXT: idivl 4(%ebx) -; X86_WIDEN-NEXT: movl %eax, 4(%edi) -; X86_WIDEN-NEXT: movl %esi, (%edi) +; X86_WIDEN-NEXT: idivl %edi +; X86_WIDEN-NEXT: movd %eax, %xmm1 +; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86_WIDEN-NEXT: movq %xmm1, (%esi) ; X86_WIDEN-NEXT: popl %esi ; X86_WIDEN-NEXT: popl %edi ; X86_WIDEN-NEXT: popl %ebx @@ -1189,19 +1196,26 @@ define void @test_srem_v2i32(<2 x i32>* %x, <2 x i32>* %y, <2 x i32>* %z) nounwi ; X86_WIDEN-NEXT: pushl %ebx ; X86_WIDEN-NEXT: pushl %edi ; X86_WIDEN-NEXT: pushl %esi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86_WIDEN-NEXT: movl (%ecx), %eax -; X86_WIDEN-NEXT: movl 4(%ecx), %ecx +; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86_WIDEN-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X86_WIDEN-NEXT: movd %xmm0, %ecx +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86_WIDEN-NEXT: movd %xmm0, %eax +; X86_WIDEN-NEXT: movd %xmm1, %edi +; X86_WIDEN-NEXT: shufps {{.*#+}} xmm1 = 
xmm1[1,1],xmm0[2,3] +; X86_WIDEN-NEXT: movd %xmm1, %ebx ; X86_WIDEN-NEXT: cltd -; X86_WIDEN-NEXT: idivl (%ebx) -; X86_WIDEN-NEXT: movl %eax, %esi +; X86_WIDEN-NEXT: idivl %ebx +; X86_WIDEN-NEXT: movd %eax, %xmm0 ; X86_WIDEN-NEXT: movl %ecx, %eax ; X86_WIDEN-NEXT: cltd -; X86_WIDEN-NEXT: idivl 4(%ebx) -; X86_WIDEN-NEXT: movl %eax, 4(%edi) -; X86_WIDEN-NEXT: movl %esi, (%edi) +; X86_WIDEN-NEXT: idivl %edi +; X86_WIDEN-NEXT: movd %eax, %xmm1 +; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86_WIDEN-NEXT: movq %xmm1, (%esi) ; X86_WIDEN-NEXT: popl %esi ; X86_WIDEN-NEXT: popl %edi ; X86_WIDEN-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/widen_cast-4.ll b/llvm/test/CodeGen/X86/widen_cast-4.ll index c3fa2f5454e..fe8b7b7eee2 100644 --- a/llvm/test/CodeGen/X86/widen_cast-4.ll +++ b/llvm/test/CodeGen/X86/widen_cast-4.ll @@ -57,15 +57,13 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind { ; WIDE-NEXT: movl %edx, {{[0-9]+}}(%esp) ; WIDE-NEXT: addl {{[0-9]+}}(%esp), %ecx ; WIDE-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; WIDE-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; WIDE-NEXT: pinsrd $1, 4(%ecx,%eax,8), %xmm3 +; WIDE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero ; WIDE-NEXT: psubb %xmm0, %xmm3 ; WIDE-NEXT: psrlw $2, %xmm3 ; WIDE-NEXT: pand %xmm1, %xmm3 ; WIDE-NEXT: pxor %xmm2, %xmm3 ; WIDE-NEXT: psubb %xmm2, %xmm3 -; WIDE-NEXT: pextrd $1, %xmm3, 4(%edx,%eax,8) -; WIDE-NEXT: movd %xmm3, (%edx,%eax,8) +; WIDE-NEXT: movq %xmm3, (%edx,%eax,8) ; WIDE-NEXT: incl (%esp) ; WIDE-NEXT: .LBB0_1: # %forcond ; WIDE-NEXT: # =>This Inner Loop Header: Depth=1 |
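
The core of the X86ISelLowering.cpp change is the DAG sequence it builds. For a 64-bit vector store, LowerStore widens the value with CONCAT_VECTORS and an UNDEF half, bitcasts to a two-element 64-bit vector (v2i64 for integer types on 64-bit targets, v2f64 otherwise), extracts element 0, and stores that scalar. ReplaceNodeResults does the reverse for loads: a scalar i64/f64 load, SCALAR_TO_VECTOR, then a bitcast to the widened vector type. The IR below is only a schematic rendering of those SelectionDAG sequences for <2 x i32> (the patch does not emit IR, and the function names are invented).

; Schematic equivalent of the new store lowering for <2 x i32>:
define void @store_v2i32_as_i64(<2 x i32> %v, i64* %p) nounwind {
  ; widen with an undef upper half (CONCAT_VECTORS v, undef)
  %wide = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; reinterpret as two 64-bit lanes and keep lane 0
  %cast = bitcast <4 x i32> %wide to <2 x i64>
  %lo = extractelement <2 x i64> %cast, i32 0
  store i64 %lo, i64* %p
  ret void
}

; Schematic equivalent of the new load lowering for <2 x i32>:
define <2 x i32> @load_v2i32_from_i64(i64* %p) nounwind {
  %s = load i64, i64* %p
  ; SCALAR_TO_VECTOR, then bitcast to the widened vector type
  %vec = insertelement <2 x i64> undef, i64 %s, i32 0
  %cast = bitcast <2 x i64> %vec to <4 x i32>
  ; only the low two lanes carry the original value
  %res = shufflevector <4 x i32> %cast, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x i32> %res
}
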

