author | Craig Topper <craig.topper@gmail.com> | 2020-01-05 17:01:57 -0800
committer | Craig Topper <craig.topper@gmail.com> | 2020-01-05 17:44:08 -0800
commit | 95840866b7d86794490ed46278fa9a9d798ee5bf (patch)
tree | ced14eecbbdc4e79ccf46aa1f34b97f053a70715
parent | ca3bf289a7f38b651280d8c6e784d4045a42b580 (diff)
download | bcm5719-llvm-95840866b7d86794490ed46278fa9a9d798ee5bf.tar.gz, bcm5719-llvm-95840866b7d86794490ed46278fa9a9d798ee5bf.zip
[X86] Improve v2i64->v2f32 and v4i64->v4f32 uint_to_fp on avx and avx2 targets.
Summary:
Based on Simon's D52965, but extended to handle strict FP and to improve some of the shuffling.
Rather than use v2i1/v4i1 and let type legalization continue, just generate all the code with legal types and use an explicit shuffle.
I also added an explicit setcc to the v4i64 code to match the semantics of vselect, which doesn't just use the sign bit. I'm also using a v4i64->v4i32 truncate instead of the shuffle in Simon's original code; with the setcc this will become a pack.
Future work can look into using X86ISD::BLENDV and a different shuffle that only moves the sign bit.
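For context, here is a standalone C++ sketch (not part of the patch) of the per-element math that this lowering vectorizes: the value is converted with a signed int64->float conversion, and inputs with the top bit set are first halved with the low bit folded back in ("round to odd") and the converted result doubled. The patch performs the same computation with vector shifts/and/or, a compare-against-zero select, scalar cvtsi2ss conversions, and a final blend; the example below is only an illustration of the idea, not the DAG code itself.

```cpp
#include <cstdint>
#include <cstdio>

// Illustrative sketch: convert an unsigned 64-bit value to float using only a
// signed 64-bit -> float conversion, as the vectorized lowering does per lane.
static float uint64_to_float_via_signed(uint64_t X) {
  if ((int64_t)X >= 0) {
    // Top bit clear: the signed conversion already gives the right answer.
    return (float)(int64_t)X;
  }
  // Top bit set: halve the value but keep the low bit ("round to odd") so the
  // rounding of the doubled result matches a direct unsigned conversion.
  uint64_t Halved = (X >> 1) | (X & 1);
  return 2.0f * (float)(int64_t)Halved;
}

int main() {
  for (uint64_t X : {0ull, 1ull, 0x8000000000000000ull,
                     0x8000000000000401ull, 0xffffffffffffffffull}) {
    printf("%llu -> %f\n", (unsigned long long)X,
           (double)uint64_to_float_via_signed(X));
  }
  return 0;
}
```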
Reviewers: RKSimon, spatel
Reviewed By: RKSimon
Subscribers: hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D71956
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 149
-rw-r--r-- | llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll | 41
-rw-r--r-- | llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll | 150
-rw-r--r-- | llvm/test/CodeGen/X86/vec_int_to_fp.ll | 690
-rw-r--r-- | llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll | 122
5 files changed, 514 insertions, 638 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 79c06af2db0..7da8aaa5f7b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1176,6 +1176,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal); + if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) { + // We need to mark SINT_TO_FP as Custom even though we want to expand it + // so that DAG combine doesn't try to turn it into uint_to_fp. + setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom); + setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom); + } + setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal); @@ -18620,42 +18629,91 @@ static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG, static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - assert(Subtarget.hasDQI() && !Subtarget.hasVLX() && "Unexpected features"); - SDLoc DL(Op); bool IsStrict = Op->isStrictFPOpcode(); MVT VT = Op->getSimpleValueType(0); SDValue Src = Op->getOperand(IsStrict ? 1 : 0); - assert((Src.getSimpleValueType() == MVT::v2i64 || - Src.getSimpleValueType() == MVT::v4i64) && - "Unsupported custom type"); - // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type. - assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) && - "Unexpected VT!"); - MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64; + if (Subtarget.hasDQI()) { + assert(!Subtarget.hasVLX() && "Unexpected features"); - // Need to concat with zero vector for strict fp to avoid spurious - // exceptions. - SDValue Tmp = - IsStrict ? DAG.getConstant(0, DL, MVT::v8i64) : DAG.getUNDEF(MVT::v8i64); - Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src, - DAG.getIntPtrConstant(0, DL)); - SDValue Res, Chain; + assert((Src.getSimpleValueType() == MVT::v2i64 || + Src.getSimpleValueType() == MVT::v4i64) && + "Unsupported custom type"); + + // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type. + assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) && + "Unexpected VT!"); + MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64; + + // Need to concat with zero vector for strict fp to avoid spurious + // exceptions. + SDValue Tmp = IsStrict ? 
DAG.getConstant(0, DL, MVT::v8i64) + : DAG.getUNDEF(MVT::v8i64); + Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src, + DAG.getIntPtrConstant(0, DL)); + SDValue Res, Chain; + if (IsStrict) { + Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other}, + {Op->getOperand(0), Src}); + Chain = Res.getValue(1); + } else { + Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src); + } + + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, + DAG.getIntPtrConstant(0, DL)); + + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, DL); + return Res; + } + + bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP || + Op->getOpcode() == ISD::STRICT_SINT_TO_FP; + if (VT != MVT::v4f32 || IsSigned) + return SDValue(); + + SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64); + SDValue One = DAG.getConstant(1, DL, MVT::v4i64); + SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64, + DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One), + DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One)); + SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT); + SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src); + SmallVector<SDValue, 4> SignCvts(4); + SmallVector<SDValue, 4> Chains(4); + for (int i = 0; i != 4; ++i) { + SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc, + DAG.getIntPtrConstant(i, DL)); + if (IsStrict) { + SignCvts[i] = + DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other}, + {Op.getOperand(0), Src}); + Chains[i] = SignCvts[i].getValue(1); + } else { + SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Src); + } + } + SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts); + + SDValue Slow, Chain; if (IsStrict) { - Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other}, - {Op->getOperand(0), Src}); - Chain = Res.getValue(1); + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other}, + {Chain, SignCvt, SignCvt}); + Chain = Slow.getValue(1); } else { - Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src); + Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt); } - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, - DAG.getIntPtrConstant(0, DL)); + IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg); + SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt); if (IsStrict) - return DAG.getMergeValues({Res, Chain}, DL); - return Res; + return DAG.getMergeValues({Cvt, Chain}, DL); + + return Cvt; } SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, @@ -29011,6 +29069,49 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } return; } + if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() && + Subtarget.hasAVX() && !Subtarget.hasAVX512()) { + // TODO Any SSE41+ subtarget should work here but BLENDV codegen ends up + // a lot worse than it should be. 
+ SDValue Zero = DAG.getConstant(0, dl, SrcVT); + SDValue One = DAG.getConstant(1, dl, SrcVT); + SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT, + DAG.getNode(ISD::SRL, dl, SrcVT, Src, One), + DAG.getNode(ISD::AND, dl, SrcVT, Src, One)); + SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT); + SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src); + SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32)); + for (int i = 0; i != 2; ++i) { + SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, + SignSrc, DAG.getIntPtrConstant(i, dl)); + if (IsStrict) + SignCvts[i] = + DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other}, + {N->getOperand(0), Src}); + else + SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Src); + }; + SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts); + SDValue Slow, Chain; + if (IsStrict) { + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + SignCvts[0].getValue(1), SignCvts[1].getValue(1)); + Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other}, + {Chain, SignCvt, SignCvt}); + Chain = Slow.getValue(1); + } else { + Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt); + } + IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg); + IsNeg = + DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1}); + SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt); + Results.push_back(Cvt); + if (IsStrict) + Results.push_back(Chain); + return; + } + if (SrcVT != MVT::v2i32) return; diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll index 5af45777e76..fbb5ac08921 100644 --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll @@ -311,33 +311,20 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 { ; ; AVX1-64-LABEL: uitofp_v2i64_v2f32: ; AVX1-64: # %bb.0: -; AVX1-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-64-NEXT: movq %rax, %rcx -; AVX1-64-NEXT: shrq %rcx -; AVX1-64-NEXT: movl %eax, %edx -; AVX1-64-NEXT: andl $1, %edx -; AVX1-64-NEXT: orq %rcx, %rdx -; AVX1-64-NEXT: testq %rax, %rax -; AVX1-64-NEXT: cmovnsq %rax, %rdx -; AVX1-64-NEXT: vcvtsi2ss %rdx, %xmm1, %xmm1 -; AVX1-64-NEXT: jns .LBB3_2 -; AVX1-64-NEXT: # %bb.1: -; AVX1-64-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; AVX1-64-NEXT: .LBB3_2: -; AVX1-64-NEXT: vmovq %xmm0, %rax -; AVX1-64-NEXT: movq %rax, %rcx -; AVX1-64-NEXT: shrq %rcx -; AVX1-64-NEXT: movl %eax, %edx -; AVX1-64-NEXT: andl $1, %edx -; AVX1-64-NEXT: orq %rcx, %rdx -; AVX1-64-NEXT: testq %rax, %rax -; AVX1-64-NEXT: cmovnsq %rax, %rdx -; AVX1-64-NEXT: vcvtsi2ss %rdx, %xmm2, %xmm0 -; AVX1-64-NEXT: jns .LBB3_4 -; AVX1-64-NEXT: # %bb.3: -; AVX1-64-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; AVX1-64-NEXT: .LBB3_4: -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX1-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 +; AVX1-64-NEXT: vpsrlq $1, %xmm0, %xmm2 +; AVX1-64-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-64-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1 +; AVX1-64-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-64-NEXT: vmovq %xmm1, %rax +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; AVX1-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero +; AVX1-64-NEXT: vaddps %xmm1, %xmm1, %xmm2 +; AVX1-64-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-64-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 +; AVX1-64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX1-64-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-64-NEXT: retq 
; ; AVX512F-64-LABEL: uitofp_v2i64_v2f32: diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll index ffeaf3b85ac..3a7a1b6ce5d 100644 --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll @@ -1058,123 +1058,59 @@ define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 { ; ; AVX1-64-LABEL: uitofp_v4i64_v4f32: ; AVX1-64: # %bb.0: -; AVX1-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-64-NEXT: movq %rax, %rcx -; AVX1-64-NEXT: shrq %rcx -; AVX1-64-NEXT: movl %eax, %edx -; AVX1-64-NEXT: andl $1, %edx -; AVX1-64-NEXT: orq %rcx, %rdx -; AVX1-64-NEXT: testq %rax, %rax -; AVX1-64-NEXT: cmovnsq %rax, %rdx -; AVX1-64-NEXT: vcvtsi2ss %rdx, %xmm1, %xmm1 -; AVX1-64-NEXT: jns .LBB19_2 -; AVX1-64-NEXT: # %bb.1: -; AVX1-64-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; AVX1-64-NEXT: .LBB19_2: -; AVX1-64-NEXT: vmovq %xmm0, %rax -; AVX1-64-NEXT: movq %rax, %rcx -; AVX1-64-NEXT: shrq %rcx -; AVX1-64-NEXT: movl %eax, %edx -; AVX1-64-NEXT: andl $1, %edx -; AVX1-64-NEXT: orq %rcx, %rdx -; AVX1-64-NEXT: testq %rax, %rax -; AVX1-64-NEXT: cmovnsq %rax, %rdx -; AVX1-64-NEXT: vcvtsi2ss %rdx, %xmm2, %xmm2 -; AVX1-64-NEXT: jns .LBB19_4 -; AVX1-64-NEXT: # %bb.3: -; AVX1-64-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX1-64-NEXT: .LBB19_4: -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX1-64-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-64-NEXT: vmovq %xmm0, %rax -; AVX1-64-NEXT: movq %rax, %rcx -; AVX1-64-NEXT: shrq %rcx -; AVX1-64-NEXT: movl %eax, %edx -; AVX1-64-NEXT: andl $1, %edx -; AVX1-64-NEXT: orq %rcx, %rdx -; AVX1-64-NEXT: testq %rax, %rax -; AVX1-64-NEXT: cmovnsq %rax, %rdx -; AVX1-64-NEXT: vcvtsi2ss %rdx, %xmm3, %xmm2 -; AVX1-64-NEXT: jns .LBB19_6 -; AVX1-64-NEXT: # %bb.5: -; AVX1-64-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX1-64-NEXT: .LBB19_6: -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX1-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-64-NEXT: movq %rax, %rcx -; AVX1-64-NEXT: shrq %rcx -; AVX1-64-NEXT: movl %eax, %edx -; AVX1-64-NEXT: andl $1, %edx -; AVX1-64-NEXT: orq %rcx, %rdx -; AVX1-64-NEXT: testq %rax, %rax -; AVX1-64-NEXT: cmovnsq %rax, %rdx -; AVX1-64-NEXT: vcvtsi2ss %rdx, %xmm3, %xmm0 -; AVX1-64-NEXT: jns .LBB19_8 -; AVX1-64-NEXT: # %bb.7: -; AVX1-64-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; AVX1-64-NEXT: .LBB19_8: -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-64-NEXT: vpsrlq $1, %xmm0, %xmm1 +; AVX1-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-64-NEXT: vpsrlq $1, %xmm2, %xmm3 +; AVX1-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-64-NEXT: vandpd {{.*}}(%rip), %ymm0, %ymm3 +; AVX1-64-NEXT: vorpd %ymm3, %ymm1, %ymm1 +; AVX1-64-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm3 +; AVX1-64-NEXT: vpextrq $1, %xmm3, %rax +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 +; AVX1-64-NEXT: vmovq %xmm3, %rax +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; AVX1-64-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] +; AVX1-64-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-64-NEXT: vblendvpd %xmm2, %xmm1, %xmm2, %xmm1 +; AVX1-64-NEXT: vmovq %xmm1, %rax +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; AVX1-64-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX1-64-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 +; AVX1-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] +; AVX1-64-NEXT: vaddps %xmm1, %xmm1, %xmm3 +; AVX1-64-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX1-64-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; 
AVX1-64-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-64-NEXT: vblendvps %xmm0, %xmm3, %xmm1, %xmm0 ; AVX1-64-NEXT: vzeroupper ; AVX1-64-NEXT: retq ; ; AVX2-64-LABEL: uitofp_v4i64_v4f32: ; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-64-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1 +; AVX2-64-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm2 +; AVX2-64-NEXT: vpsrlq $1, %ymm0, %ymm3 +; AVX2-64-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX2-64-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-64-NEXT: movq %rax, %rcx -; AVX2-64-NEXT: shrq %rcx -; AVX2-64-NEXT: movl %eax, %edx -; AVX2-64-NEXT: andl $1, %edx -; AVX2-64-NEXT: orq %rcx, %rdx -; AVX2-64-NEXT: testq %rax, %rax -; AVX2-64-NEXT: cmovnsq %rax, %rdx -; AVX2-64-NEXT: vcvtsi2ss %rdx, %xmm1, %xmm1 -; AVX2-64-NEXT: jns .LBB19_2 -; AVX2-64-NEXT: # %bb.1: -; AVX2-64-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; AVX2-64-NEXT: .LBB19_2: +; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 ; AVX2-64-NEXT: vmovq %xmm0, %rax -; AVX2-64-NEXT: movq %rax, %rcx -; AVX2-64-NEXT: shrq %rcx -; AVX2-64-NEXT: movl %eax, %edx -; AVX2-64-NEXT: andl $1, %edx -; AVX2-64-NEXT: orq %rcx, %rdx -; AVX2-64-NEXT: testq %rax, %rax -; AVX2-64-NEXT: cmovnsq %rax, %rdx -; AVX2-64-NEXT: vcvtsi2ss %rdx, %xmm2, %xmm2 -; AVX2-64-NEXT: jns .LBB19_4 -; AVX2-64-NEXT: # %bb.3: -; AVX2-64-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX2-64-NEXT: .LBB19_4: -; AVX2-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; AVX2-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-64-NEXT: vmovq %xmm0, %rax -; AVX2-64-NEXT: movq %rax, %rcx -; AVX2-64-NEXT: shrq %rcx -; AVX2-64-NEXT: movl %eax, %edx -; AVX2-64-NEXT: andl $1, %edx -; AVX2-64-NEXT: orq %rcx, %rdx -; AVX2-64-NEXT: testq %rax, %rax -; AVX2-64-NEXT: cmovnsq %rax, %rdx -; AVX2-64-NEXT: vcvtsi2ss %rdx, %xmm3, %xmm2 -; AVX2-64-NEXT: jns .LBB19_6 -; AVX2-64-NEXT: # %bb.5: -; AVX2-64-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX2-64-NEXT: .LBB19_6: -; AVX2-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; AVX2-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; AVX2-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-64-NEXT: movq %rax, %rcx -; AVX2-64-NEXT: shrq %rcx -; AVX2-64-NEXT: movl %eax, %edx -; AVX2-64-NEXT: andl $1, %edx -; AVX2-64-NEXT: orq %rcx, %rdx -; AVX2-64-NEXT: testq %rax, %rax -; AVX2-64-NEXT: cmovnsq %rax, %rdx -; AVX2-64-NEXT: vcvtsi2ss %rdx, %xmm3, %xmm0 -; AVX2-64-NEXT: jns .LBB19_8 -; AVX2-64-NEXT: # %bb.7: -; AVX2-64-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; AVX2-64-NEXT: .LBB19_8: -; AVX2-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 +; AVX2-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; AVX2-64-NEXT: vaddps %xmm0, %xmm0, %xmm2 +; AVX2-64-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 ; AVX2-64-NEXT: vzeroupper ; AVX2-64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index b227bdd6945..ddefae2ed33 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -1924,35 +1924,20 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) { ; ; VEX-LABEL: uitofp_2i64_to_4f32: ; VEX: # %bb.0: -; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: 
testq %rax, %rax -; VEX-NEXT: js .LBB41_1 -; VEX-NEXT: # %bb.2: -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; VEX-NEXT: jmp .LBB41_3 -; VEX-NEXT: .LBB41_1: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; VEX-NEXT: .LBB41_3: -; VEX-NEXT: vmovq %xmm0, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB41_4 -; VEX-NEXT: # %bb.5: -; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; VEX-NEXT: retq -; VEX-NEXT: .LBB41_4: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; VEX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 +; VEX-NEXT: vpsrlq $1, %xmm0, %xmm2 +; VEX-NEXT: vpor %xmm1, %xmm2, %xmm1 +; VEX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1 +; VEX-NEXT: vpextrq $1, %xmm1, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; VEX-NEXT: vmovq %xmm1, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero +; VEX-NEXT: vaddps %xmm1, %xmm1, %xmm2 +; VEX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; VEX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 +; VEX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; VEX-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; VEX-NEXT: retq ; ; AVX512F-LABEL: uitofp_2i64_to_4f32: @@ -2071,35 +2056,21 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) { ; ; VEX-LABEL: uitofp_2i64_to_2f32: ; VEX: # %bb.0: -; VEX-NEXT: vmovq %xmm0, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB42_1 -; VEX-NEXT: # %bb.2: -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; VEX-NEXT: jmp .LBB42_3 -; VEX-NEXT: .LBB42_1: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; VEX-NEXT: .LBB42_3: -; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB42_4 -; VEX-NEXT: # %bb.5: -; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero -; VEX-NEXT: retq -; VEX-NEXT: .LBB42_4: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero +; VEX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 +; VEX-NEXT: vpsrlq $1, %xmm0, %xmm2 +; VEX-NEXT: vpor %xmm1, %xmm2, %xmm1 +; VEX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1 +; VEX-NEXT: vpextrq $1, %xmm1, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; VEX-NEXT: vmovq %xmm1, %rax +; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero +; VEX-NEXT: vaddps %xmm1, %xmm1, %xmm2 +; VEX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; VEX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 +; VEX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; VEX-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; VEX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; VEX-NEXT: retq ; ; AVX512F-LABEL: uitofp_2i64_to_2f32: @@ -2211,38 +2182,60 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; SSE41-NEXT: retq ; -; VEX-LABEL: uitofp_4i64_to_4f32_undef: -; VEX: # %bb.0: -; 
VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB43_1 -; VEX-NEXT: # %bb.2: -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; VEX-NEXT: jmp .LBB43_3 -; VEX-NEXT: .LBB43_1: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; VEX-NEXT: .LBB43_3: -; VEX-NEXT: vmovq %xmm0, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB43_4 -; VEX-NEXT: # %bb.5: -; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; VEX-NEXT: retq -; VEX-NEXT: .LBB43_4: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; VEX-NEXT: retq +; AVX1-LABEL: uitofp_4i64_to_4f32_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1 +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm2 +; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpextrq $1, %xmm2, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 +; AVX1-NEXT: vmovq %xmm2, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vcvtdq2ps %xmm1, %xmm3 +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: vcvtsi2ss %eax, %xmm4, %xmm1 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] +; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_4i64_to_4f32_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm2 +; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm1 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] +; AVX2-NEXT: vaddps %xmm1, %xmm1, %xmm2 +; AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512F-LABEL: uitofp_4i64_to_4f32_undef: ; AVX512F: # %bb.0: @@ -2636,133 +2629,59 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; ; AVX1-LABEL: uitofp_4i64_to_4f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB49_1 -; AVX1-NEXT: # %bb.2: -; AVX1-NEXT: vcvtsi2ss %rax, 
%xmm1, %xmm1 -; AVX1-NEXT: jmp .LBB49_3 -; AVX1-NEXT: .LBB49_1: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: .LBB49_3: -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB49_4 -; AVX1-NEXT: # %bb.5: -; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX1-NEXT: jmp .LBB49_6 -; AVX1-NEXT: .LBB49_4: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: .LBB49_6: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB49_7 -; AVX1-NEXT: # %bb.8: -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX1-NEXT: jmp .LBB49_9 -; AVX1-NEXT: .LBB49_7: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: .LBB49_9: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB49_10 -; AVX1-NEXT: # %bb.11: -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; AVX1-NEXT: .LBB49_10: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vandpd {{.*}}(%rip), %ymm0, %ymm3 +; AVX1-NEXT: vorpd %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vpextrq $1, %xmm3, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 +; AVX1-NEXT: vmovq %xmm3, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] +; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm3 +; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: uitofp_4i64_to_4f32: ; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm3 +; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: 
js .LBB49_1 -; AVX2-NEXT: # %bb.2: -; AVX2-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; AVX2-NEXT: jmp .LBB49_3 -; AVX2-NEXT: .LBB49_1: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: .LBB49_3: +; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB49_4 -; AVX2-NEXT: # %bb.5: -; AVX2-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX2-NEXT: jmp .LBB49_6 -; AVX2-NEXT: .LBB49_4: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: .LBB49_6: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB49_7 -; AVX2-NEXT: # %bb.8: -; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX2-NEXT: jmp .LBB49_9 -; AVX2-NEXT: .LBB49_7: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: .LBB49_9: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB49_10 -; AVX2-NEXT: # %bb.11: -; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; AVX2-NEXT: .LBB49_10: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; AVX2-NEXT: vaddps %xmm0, %xmm0, %xmm2 +; AVX2-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4649,70 +4568,66 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; SSE41-NEXT: retq ; -; VEX-LABEL: uitofp_load_4i64_to_4f32: -; VEX: # %bb.0: -; VEX-NEXT: vmovdqa (%rdi), %xmm2 -; VEX-NEXT: vmovaps 16(%rdi), %xmm0 -; VEX-NEXT: vpextrq $1, %xmm2, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB83_1 -; VEX-NEXT: # %bb.2: -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; VEX-NEXT: jmp .LBB83_3 -; VEX-NEXT: .LBB83_1: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; VEX-NEXT: .LBB83_3: -; VEX-NEXT: vmovq %xmm2, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB83_4 -; VEX-NEXT: # %bb.5: -; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; VEX-NEXT: jmp .LBB83_6 -; VEX-NEXT: .LBB83_4: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; VEX-NEXT: .LBB83_6: -; VEX-NEXT: 
vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; VEX-NEXT: vmovq %xmm0, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB83_7 -; VEX-NEXT: # %bb.8: -; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; VEX-NEXT: jmp .LBB83_9 -; VEX-NEXT: .LBB83_7: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; VEX-NEXT: .LBB83_9: -; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB83_10 -; VEX-NEXT: # %bb.11: -; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; VEX-NEXT: retq -; VEX-NEXT: .LBB83_10: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; VEX-NEXT: retq +; AVX1-LABEL: uitofp_load_4i64_to_4f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm3 +; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vandpd {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vorpd %ymm0, %ymm3, %ymm0 +; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm1, %xmm3 +; AVX1-NEXT: vpextrq $1, %xmm3, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; AVX1-NEXT: vmovq %xmm3, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] +; AVX1-NEXT: vaddps %xmm0, %xmm0, %xmm3 +; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vblendvps %xmm1, %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_load_4i64_to_4f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1] +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm3 +; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; AVX2-NEXT: vaddps %xmm0, %xmm0, %xmm2 +; AVX2-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512F-LABEL: 
uitofp_load_4i64_to_4f32: ; AVX512F: # %bb.0: @@ -5168,132 +5083,113 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; SSE41-NEXT: retq ; -; VEX-LABEL: uitofp_load_8i64_to_8f32: -; VEX: # %bb.0: -; VEX-NEXT: vmovdqa (%rdi), %xmm1 -; VEX-NEXT: vmovaps 16(%rdi), %xmm0 -; VEX-NEXT: vmovdqa 32(%rdi), %xmm4 -; VEX-NEXT: vmovdqa 48(%rdi), %xmm3 -; VEX-NEXT: vpextrq $1, %xmm4, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB87_1 -; VEX-NEXT: # %bb.2: -; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; VEX-NEXT: jmp .LBB87_3 -; VEX-NEXT: .LBB87_1: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; VEX-NEXT: .LBB87_3: -; VEX-NEXT: vmovq %xmm4, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB87_4 -; VEX-NEXT: # %bb.5: -; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm5 -; VEX-NEXT: jmp .LBB87_6 -; VEX-NEXT: .LBB87_4: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; VEX-NEXT: vaddss %xmm4, %xmm4, %xmm5 -; VEX-NEXT: .LBB87_6: -; VEX-NEXT: vmovq %xmm3, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB87_7 -; VEX-NEXT: # %bb.8: -; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm4 -; VEX-NEXT: jmp .LBB87_9 -; VEX-NEXT: .LBB87_7: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm4 -; VEX-NEXT: vaddss %xmm4, %xmm4, %xmm4 -; VEX-NEXT: .LBB87_9: -; VEX-NEXT: vpextrq $1, %xmm3, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB87_10 -; VEX-NEXT: # %bb.11: -; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm3 -; VEX-NEXT: jmp .LBB87_12 -; VEX-NEXT: .LBB87_10: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm3 -; VEX-NEXT: vaddss %xmm3, %xmm3, %xmm3 -; VEX-NEXT: .LBB87_12: -; VEX-NEXT: vpextrq $1, %xmm1, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB87_13 -; VEX-NEXT: # %bb.14: -; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm6 -; VEX-NEXT: jmp .LBB87_15 -; VEX-NEXT: .LBB87_13: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm6 -; VEX-NEXT: vaddss %xmm6, %xmm6, %xmm6 -; VEX-NEXT: .LBB87_15: -; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[2,3] -; VEX-NEXT: vmovq %xmm1, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB87_16 -; VEX-NEXT: # %bb.17: -; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm1 -; VEX-NEXT: jmp .LBB87_18 -; VEX-NEXT: .LBB87_16: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm1 -; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; VEX-NEXT: .LBB87_18: -; VEX-NEXT: vinsertps {{.*#+}} xmm5 = xmm1[0],xmm6[0],xmm1[2,3] -; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm4[0],xmm2[3] -; VEX-NEXT: vmovq %xmm0, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB87_19 -; VEX-NEXT: # %bb.20: -; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm2 -; VEX-NEXT: jmp .LBB87_21 -; VEX-NEXT: .LBB87_19: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm2 -; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; VEX-NEXT: .LBB87_21: -; VEX-NEXT: vinsertps 
{{.*#+}} xmm2 = xmm5[0,1],xmm2[0],xmm5[3] -; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] -; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB87_22 -; VEX-NEXT: # %bb.23: -; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm0 -; VEX-NEXT: jmp .LBB87_24 -; VEX-NEXT: .LBB87_22: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm0 -; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; VEX-NEXT: .LBB87_24: -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; VEX-NEXT: retq +; AVX1-LABEL: uitofp_load_8i64_to_8f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [1,1,1,1] +; AVX1-NEXT: vandpd 32(%rdi), %ymm2, %ymm3 +; AVX1-NEXT: vmovaps (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm5 +; AVX1-NEXT: vpsrlq $1, %xmm4, %xmm6 +; AVX1-NEXT: vpsrlq $1, %xmm5, %xmm7 +; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-NEXT: vorpd %ymm3, %ymm6, %ymm3 +; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm4, %xmm6 +; AVX1-NEXT: vpextrq $1, %xmm6, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm8, %xmm7 +; AVX1-NEXT: vmovq %xmm6, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm8, %xmm6 +; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 +; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vmovq %xmm3, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm8, %xmm7 +; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0,1],xmm7[0],xmm6[3] +; AVX1-NEXT: vpextrq $1, %xmm3, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm8, %xmm3 +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[0] +; AVX1-NEXT: vaddps %xmm3, %xmm3, %xmm6 +; AVX1-NEXT: vxorps %xmm7, %xmm7, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpackssdw %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vblendvps %xmm4, %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vandpd (%rdi), %ymm2, %ymm2 +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm4 +; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-NEXT: vorpd %ymm2, %ymm4, %ymm2 +; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpextrq $1, %xmm4, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm8, %xmm5 +; AVX1-NEXT: vmovq %xmm4, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm8, %xmm4 +; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vmovq %xmm2, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm8, %xmm5 +; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0],xmm4[3] +; AVX1-NEXT: vpextrq $1, %xmm2, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm8, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[0] +; AVX1-NEXT: vaddps %xmm2, %xmm2, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm7, %xmm1 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, %xmm4, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_load_8i64_to_8f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [1,1,1,1] +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm6 +; AVX2-NEXT: vpor 
%ymm5, %ymm6, %ymm5 +; AVX2-NEXT: vblendvpd %ymm1, %ymm5, %ymm1, %ymm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm7, %xmm5 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm7, %xmm6 +; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[2,3] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm7, %xmm6 +; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0],xmm5[3] +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm7, %xmm1 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[0] +; AVX2-NEXT: vaddps %xmm1, %xmm1, %xmm5 +; AVX2-NEXT: vblendvps %xmm3, %xmm5, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm3 +; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm4 +; AVX2-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm7, %xmm3 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm7, %xmm4 +; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm7, %xmm4 +; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vcvtsi2ss %rax, %xmm7, %xmm0 +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] +; AVX2-NEXT: vaddps %xmm0, %xmm0, %xmm3 +; AVX2-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: uitofp_load_8i64_to_8f32: ; AVX512F: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll index 80204aa4390..83ddd78564f 100644 --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -6941,33 +6941,20 @@ define <2 x float> @constrained_vector_uitofp_v2f32_v2i64(<2 x i64> %x) #0 { ; ; AVX1-LABEL: constrained_vector_uitofp_v2f32_v2i64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: andl $1, %edx -; AVX1-NEXT: orq %rcx, %rdx -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: cmovnsq %rax, %rdx -; AVX1-NEXT: vcvtsi2ss %rdx, %xmm1, %xmm1 -; AVX1-NEXT: jns .LBB174_2 -; AVX1-NEXT: # %bb.1: -; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: .LBB174_2: # %entry -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: andl $1, %edx -; AVX1-NEXT: orq %rcx, %rdx -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: cmovnsq %rax, %rdx -; AVX1-NEXT: vcvtsi2ss %rdx, %xmm2, %xmm0 -; AVX1-NEXT: jns .LBB174_4 -; AVX1-NEXT: # %bb.3: -; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: .LBB174_4: # %entry -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm2 +; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero +; 
AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_uitofp_v2f32_v2i64: @@ -7471,62 +7458,31 @@ define <4 x float> @constrained_vector_uitofp_v4f32_v4i64(<4 x i64> %x) #0 { ; ; AVX1-LABEL: constrained_vector_uitofp_v4f32_v4i64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: andl $1, %edx -; AVX1-NEXT: orq %rcx, %rdx -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: cmovnsq %rax, %rdx -; AVX1-NEXT: vcvtsi2ss %rdx, %xmm1, %xmm1 -; AVX1-NEXT: jns .LBB182_2 -; AVX1-NEXT: # %bb.1: -; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: .LBB182_2: # %entry -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: andl $1, %edx -; AVX1-NEXT: orq %rcx, %rdx -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: cmovnsq %rax, %rdx -; AVX1-NEXT: vcvtsi2ss %rdx, %xmm2, %xmm2 -; AVX1-NEXT: jns .LBB182_4 -; AVX1-NEXT: # %bb.3: -; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: .LBB182_4: # %entry -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: andl $1, %edx -; AVX1-NEXT: orq %rcx, %rdx -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: cmovnsq %rax, %rdx -; AVX1-NEXT: vcvtsi2ss %rdx, %xmm3, %xmm2 -; AVX1-NEXT: jns .LBB182_6 -; AVX1-NEXT: # %bb.5: -; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: .LBB182_6: # %entry -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: andl $1, %edx -; AVX1-NEXT: orq %rcx, %rdx -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: cmovnsq %rax, %rdx -; AVX1-NEXT: vcvtsi2ss %rdx, %xmm3, %xmm0 -; AVX1-NEXT: jns .LBB182_8 -; AVX1-NEXT: # %bb.7: -; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: .LBB182_8: # %entry -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vandpd {{.*}}(%rip), %ymm0, %ymm3 +; AVX1-NEXT: vorpd %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm3 +; AVX1-NEXT: vpextrq $1, %xmm3, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 +; AVX1-NEXT: vmovq %xmm3, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] +; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm3 +; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm1, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; |