 llvm/lib/Target/X86/X86ISelLowering.cpp              |  59
 llvm/test/CodeGen/X86/avx1-logical-load-folding.ll   |   4
 llvm/test/CodeGen/X86/avx512-hadd-hsub.ll            |  12
 llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll |  40
 llvm/test/CodeGen/X86/exedeps-movq.ll                |   8
 llvm/test/CodeGen/X86/extractelement-fp.ll           |  81
 llvm/test/CodeGen/X86/ftrunc.ll                      |   2
 llvm/test/CodeGen/X86/haddsub.ll                     |   8
 llvm/test/CodeGen/X86/scalar-int-to-fp.ll            |   6
 llvm/test/CodeGen/X86/vec_extract.ll                 |   8
 llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll     | 168
 llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll     | 168
 12 files changed, 306 insertions(+), 258 deletions(-)
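For reference, the pattern targeted by the new combine is visible in the fadd_v4f32 test from extractelement-fp.ll (updated below); a minimal IR reproducer, copied from that test, looks like this:

; Only lane 0 of the vector fadd is used, so the extract is pushed onto the
; operands and the add is performed as a scalar. Per the updated CHECK lines,
; x86 codegen now emits vaddss instead of vaddps for this function.
define float @fadd_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
  %v = fadd <4 x float> %x, %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

The same scalarization applies to the other opcodes handled by the switch in scalarizeExtEltFP (fsub, fmul, fdiv, fma, sqrt, min/max, rounding, copysign, etc.), which is what the remaining test updates below reflect.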
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ac49ba02351..de666798b03 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34240,6 +34240,62 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+/// Extracting a scalar FP value from vector element 0 is free, so extract each
+/// operand first, then perform the math as a scalar op.
+static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
+  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
+  SDValue Vec = ExtElt->getOperand(0);
+  SDValue Index = ExtElt->getOperand(1);
+  EVT VT = ExtElt->getValueType(0);
+  EVT VecVT = Vec.getValueType();
+
+  // TODO: If this is a unary/expensive/expand op, allow extraction from a
+  // non-zero element because the shuffle+scalar op will be cheaper?
+  if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
+    return SDValue();
+
+  if (VT != MVT::f32 && VT != MVT::f64)
+    return SDValue();
+
+  // TODO: This switch could include FNEG, the x86-specific FP logic ops
+  // (FAND, FANDN, FOR, FXOR), FRSQRT/FRCP and other FP math ops. But that may
+  // require enhancements to avoid missed load folding and fma+fneg combining.
+  switch (Vec.getOpcode()) {
+  case ISD::FMA: // Begin 3 operands
+  case ISD::FMAD:
+  case ISD::FADD: // Begin 2 operands
+  case ISD::FSUB:
+  case ISD::FMUL:
+  case ISD::FDIV:
+  case ISD::FREM:
+  case ISD::FCOPYSIGN:
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE:
+  case ISD::FMAXIMUM:
+  case ISD::FMINIMUM:
+  case ISD::FABS: // Begin 1 operand
+  case ISD::FSQRT:
+  case ISD::FRINT:
+  case ISD::FCEIL:
+  case ISD::FTRUNC:
+  case ISD::FNEARBYINT:
+  case ISD::FROUND:
+  case ISD::FFLOOR: {
+    // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
+    SDLoc DL(ExtElt);
+    SmallVector<SDValue, 4> ExtOps;
+    for (SDValue Op : Vec->ops())
+      ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
+    return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
+  }
+  default:
+    return SDValue();
+  }
+  llvm_unreachable("All opcodes should return within switch");
+}
+
 /// Detect vector gather/scatter index generation and convert it from being a
 /// bunch of shuffles and extracts into a somewhat faster sequence.
/// For i686, the best sequence is apparently storing the value and loading @@ -34310,6 +34366,9 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,    if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))      return MinMax; +  if (SDValue V = scalarizeExtEltFP(N, DAG)) +    return V; +    return SDValue();  } diff --git a/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll b/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll index c1ee3182fb1..0871b9663e3 100644 --- a/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll +++ b/llvm/test/CodeGen/X86/avx1-logical-load-folding.ll @@ -8,14 +8,14 @@ define void @test1(float* %A, float* %C) #0 {  ; X86:       ## %bb.0:  ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax  ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT:    vmovaps (%ecx), %xmm0 +; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero  ; X86-NEXT:    vandps LCPI0_0, %xmm0, %xmm0  ; X86-NEXT:    vmovss %xmm0, (%eax)  ; X86-NEXT:    retl  ;  ; X64-LABEL: test1:  ; X64:       ## %bb.0: -; X64-NEXT:    vmovaps (%rdi), %xmm0 +; X64-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero  ; X64-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0  ; X64-NEXT:    vmovss %xmm0, (%rsi)  ; X64-NEXT:    retq diff --git a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll index 32d5299a88a..f16d3bb7a24 100644 --- a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll +++ b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll @@ -62,7 +62,7 @@ define float @fhadd_16(<16 x float> %x225) {  ; KNL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; KNL-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; KNL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; KNL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; KNL-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; KNL-NEXT:    retq  ;  ; SKX-LABEL: fhadd_16: @@ -70,7 +70,7 @@ define float @fhadd_16(<16 x float> %x225) {  ; SKX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; SKX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; SKX-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; SKX-NEXT:    vzeroupper  ; SKX-NEXT:    retq    %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> @@ -87,7 +87,7 @@ define float @fhsub_16(<16 x float> %x225) {  ; KNL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; KNL-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; KNL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; KNL-NEXT:    vsubps %xmm1, %xmm0, %xmm0 +; KNL-NEXT:    vsubss %xmm1, %xmm0, %xmm0  ; KNL-NEXT:    retq  ;  ; SKX-LABEL: fhsub_16: @@ -95,7 +95,7 @@ define float @fhsub_16(<16 x float> %x225) {  ; SKX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; SKX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SKX-NEXT:    vsubps %xmm1, %xmm0, %xmm0 +; SKX-NEXT:    vsubss %xmm1, %xmm0, %xmm0  ; SKX-NEXT:    vzeroupper  ; SKX-NEXT:    retq    %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> @@ -227,13 +227,13 @@ define double @fadd_noundef_eel(<8 x double> %x225, <8 x double> %x227) {  ; KNL-LABEL: fadd_noundef_eel:  ; KNL:       # %bb.0:  ; KNL-NEXT:    vpermilpd {{.*#+}} 
xmm1 = xmm0[1,0] -; KNL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; KNL-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; KNL-NEXT:    retq  ;  ; SKX-LABEL: fadd_noundef_eel:  ; SKX:       # %bb.0:  ; SKX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; SKX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; SKX-NEXT:    vzeroupper  ; SKX-NEXT:    retq    %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index 6a6fbe333aa..5b2c0a749c0 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -7289,8 +7289,8 @@ define double @test_mm512_reduce_add_pd(<8 x double> %__W) {  ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0  ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 -; X86-NEXT:    vmovlpd %xmm0, (%esp) +; X86-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 +; X86-NEXT:    vmovsd %xmm0, (%esp)  ; X86-NEXT:    fldl (%esp)  ; X86-NEXT:    movl %ebp, %esp  ; X86-NEXT:    popl %ebp @@ -7305,7 +7305,7 @@ define double @test_mm512_reduce_add_pd(<8 x double> %__W) {  ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0  ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; X64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; X64-NEXT:    vzeroupper  ; X64-NEXT:    retq  entry: @@ -7336,8 +7336,8 @@ define double @test_mm512_reduce_mul_pd(<8 x double> %__W) {  ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; X86-NEXT:    vmulpd %xmm1, %xmm0, %xmm0  ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; X86-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 -; X86-NEXT:    vmovlpd %xmm0, (%esp) +; X86-NEXT:    vmulsd %xmm1, %xmm0, %xmm0 +; X86-NEXT:    vmovsd %xmm0, (%esp)  ; X86-NEXT:    fldl (%esp)  ; X86-NEXT:    movl %ebp, %esp  ; X86-NEXT:    popl %ebp @@ -7352,7 +7352,7 @@ define double @test_mm512_reduce_mul_pd(<8 x double> %__W) {  ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; X64-NEXT:    vmulpd %xmm1, %xmm0, %xmm0  ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; X64-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 +; X64-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; X64-NEXT:    vzeroupper  ; X64-NEXT:    retq  entry: @@ -7380,7 +7380,7 @@ define float @test_mm512_reduce_add_ps(<16 x float> %__W) {  ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; X86-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; X86-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; X86-NEXT:    vmovss %xmm0, (%esp)  ; X86-NEXT:    flds (%esp)  ; X86-NEXT:    popl %eax @@ -7397,7 +7397,7 @@ define float @test_mm512_reduce_add_ps(<16 x float> %__W) {  ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; X64-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; X64-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; X64-NEXT:    vzeroupper  ; X64-NEXT:    retq  entry: @@ -7430,7 +7430,7 @@ define float @test_mm512_reduce_mul_ps(<16 x float> %__W) {  ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0  ; X86-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; X86-NEXT:    vmulss %xmm1, %xmm0, 
%xmm0  ; X86-NEXT:    vmovss %xmm0, (%esp)  ; X86-NEXT:    flds (%esp)  ; X86-NEXT:    popl %eax @@ -7447,7 +7447,7 @@ define float @test_mm512_reduce_mul_ps(<16 x float> %__W) {  ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0  ; X64-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; X64-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; X64-NEXT:    vzeroupper  ; X64-NEXT:    retq  entry: @@ -7486,8 +7486,8 @@ define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W)  ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0  ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 -; X86-NEXT:    vmovlpd %xmm0, (%esp) +; X86-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 +; X86-NEXT:    vmovsd %xmm0, (%esp)  ; X86-NEXT:    fldl (%esp)  ; X86-NEXT:    movl %ebp, %esp  ; X86-NEXT:    popl %ebp @@ -7504,7 +7504,7 @@ define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W)  ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0  ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; X64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; X64-NEXT:    vzeroupper  ; X64-NEXT:    retq  entry: @@ -7541,8 +7541,8 @@ define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W)  ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; X86-NEXT:    vmulpd %xmm1, %xmm0, %xmm0  ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; X86-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 -; X86-NEXT:    vmovlpd %xmm0, (%esp) +; X86-NEXT:    vmulsd %xmm1, %xmm0, %xmm0 +; X86-NEXT:    vmovsd %xmm0, (%esp)  ; X86-NEXT:    fldl (%esp)  ; X86-NEXT:    movl %ebp, %esp  ; X86-NEXT:    popl %ebp @@ -7560,7 +7560,7 @@ define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W)  ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; X64-NEXT:    vmulpd %xmm1, %xmm0, %xmm0  ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; X64-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 +; X64-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; X64-NEXT:    vzeroupper  ; X64-NEXT:    retq  entry: @@ -7593,7 +7593,7 @@ define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W)  ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; X86-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; X86-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; X86-NEXT:    vmovss %xmm0, (%esp)  ; X86-NEXT:    flds (%esp)  ; X86-NEXT:    popl %eax @@ -7612,7 +7612,7 @@ define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W)  ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; X64-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; X64-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; X64-NEXT:    vzeroupper  ; X64-NEXT:    retq  entry: @@ -7651,7 +7651,7 @@ define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W)  ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0  ; X86-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; X86-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; X86-NEXT:    vmovss %xmm0, (%esp)  ; X86-NEXT:    flds (%esp)  ; X86-NEXT:    popl %eax @@ -7671,7 +7671,7 @@ define float @test_mm512_mask_reduce_mul_ps(i16 
zeroext %__M, <16 x float> %__W)  ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0  ; X64-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; X64-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; X64-NEXT:    vzeroupper  ; X64-NEXT:    retq  entry: diff --git a/llvm/test/CodeGen/X86/exedeps-movq.ll b/llvm/test/CodeGen/X86/exedeps-movq.ll index cc56be672db..f907cd537eb 100644 --- a/llvm/test/CodeGen/X86/exedeps-movq.ll +++ b/llvm/test/CodeGen/X86/exedeps-movq.ll @@ -32,14 +32,14 @@ define void @store_floats(<4 x float> %x, i64* %p) {  define void @store_double(<2 x double> %x, i64* %p) {  ; SSE-LABEL: store_double:  ; SSE:       # %bb.0: -; SSE-NEXT:    addpd %xmm0, %xmm0 -; SSE-NEXT:    movlpd %xmm0, (%rdi) +; SSE-NEXT:    addsd %xmm0, %xmm0 +; SSE-NEXT:    movsd %xmm0, (%rdi)  ; SSE-NEXT:    retq  ;  ; AVX-LABEL: store_double:  ; AVX:       # %bb.0: -; AVX-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 -; AVX-NEXT:    vmovlpd %xmm0, (%rdi) +; AVX-NEXT:    vaddsd %xmm0, %xmm0, %xmm0 +; AVX-NEXT:    vmovsd %xmm0, (%rdi)  ; AVX-NEXT:    retq    %a = fadd <2 x double> %x, %x    %b = extractelement <2 x double> %a, i32 0 diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll index 82d5a74b4e9..f9d34f471f5 100644 --- a/llvm/test/CodeGen/X86/extractelement-fp.ll +++ b/llvm/test/CodeGen/X86/extractelement-fp.ll @@ -27,7 +27,7 @@ define double @fneg_v4f64(<4 x double> %x) nounwind {  define float @fadd_v4f32(<4 x float> %x, <4 x float> %y) nounwind {  ; CHECK-LABEL: fadd_v4f32:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; CHECK-NEXT:    retq    %v = fadd <4 x float> %x, %y    %r = extractelement <4 x float> %v, i32 0 @@ -37,7 +37,7 @@ define float @fadd_v4f32(<4 x float> %x, <4 x float> %y) nounwind {  define double @fadd_v4f64(<4 x double> %x, <4 x double> %y) nounwind {  ; CHECK-LABEL: fadd_v4f64:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq    %v = fadd <4 x double> %x, %y @@ -48,7 +48,7 @@ define double @fadd_v4f64(<4 x double> %x, <4 x double> %y) nounwind {  define float @fsub_v4f32(<4 x float> %x, <4 x float> %y) nounwind {  ; CHECK-LABEL: fsub_v4f32:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vsubps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT:    vsubss %xmm1, %xmm0, %xmm0  ; CHECK-NEXT:    retq    %v = fsub <4 x float> %x, %y    %r = extractelement <4 x float> %v, i32 0 @@ -58,7 +58,7 @@ define float @fsub_v4f32(<4 x float> %x, <4 x float> %y) nounwind {  define double @fsub_v4f64(<4 x double> %x, <4 x double> %y) nounwind {  ; CHECK-LABEL: fsub_v4f64:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vsubpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT:    vsubsd %xmm1, %xmm0, %xmm0  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq    %v = fsub <4 x double> %x, %y @@ -69,7 +69,7 @@ define double @fsub_v4f64(<4 x double> %x, <4 x double> %y) nounwind {  define float @fmul_v4f32(<4 x float> %x, <4 x float> %y) nounwind {  ; CHECK-LABEL: fmul_v4f32:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; CHECK-NEXT:    retq    %v = fmul <4 x float> %x, %y    %r = extractelement <4 x float> %v, i32 0 @@ -79,7 +79,7 @@ define float @fmul_v4f32(<4 x float> %x, <4 x float> %y) nounwind {  define double @fmul_v4f64(<4 x double> %x, <4 x double> %y) nounwind {  
; CHECK-LABEL: fmul_v4f64:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq    %v = fmul <4 x double> %x, %y @@ -90,7 +90,7 @@ define double @fmul_v4f64(<4 x double> %x, <4 x double> %y) nounwind {  define float @fdiv_v4f32(<4 x float> %x, <4 x float> %y) nounwind {  ; CHECK-LABEL: fdiv_v4f32:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vdivps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT:    vdivss %xmm1, %xmm0, %xmm0  ; CHECK-NEXT:    retq    %v = fdiv <4 x float> %x, %y    %r = extractelement <4 x float> %v, i32 0 @@ -100,7 +100,7 @@ define float @fdiv_v4f32(<4 x float> %x, <4 x float> %y) nounwind {  define double @fdiv_v4f64(<4 x double> %x, <4 x double> %y) nounwind {  ; CHECK-LABEL: fdiv_v4f64:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vdivpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT:    vdivsd %xmm1, %xmm0, %xmm0  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq    %v = fdiv <4 x double> %x, %y @@ -132,7 +132,7 @@ define double @frem_v4f64(<4 x double> %x, <4 x double> %y) nounwind {  define float @fsqrt_v4f32(<4 x float> %x) nounwind {  ; CHECK-LABEL: fsqrt_v4f32:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vsqrtps %xmm0, %xmm0 +; CHECK-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0  ; CHECK-NEXT:    retq    %v = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)    %r = extractelement <4 x float> %v, i32 0 @@ -142,8 +142,7 @@ define float @fsqrt_v4f32(<4 x float> %x) nounwind {  define double @fsqrt_v4f64(<4 x double> %x) nounwind {  ; CHECK-LABEL: fsqrt_v4f64:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vsqrtpd %ymm0, %ymm0 -; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq    %v = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %x) @@ -174,7 +173,7 @@ define double @fsin_v4f64(<4 x double> %x) nounwind {  define float @fma_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z) nounwind {  ; CHECK-LABEL: fma_v4f32:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 +; CHECK-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2  ; CHECK-NEXT:    retq    %v = call <4 x float> @llvm.fma.v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z)    %r = extractelement <4 x float> %v, i32 0 @@ -184,7 +183,7 @@ define float @fma_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z) nounwind  define double @fma_v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z) nounwind {  ; CHECK-LABEL: fma_v4f64:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 +; CHECK-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2  ; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq @@ -207,8 +206,7 @@ define float @fabs_v4f32(<4 x float> %x) nounwind {  define double @fabs_v4f64(<4 x double> %x) nounwind {  ; CHECK-LABEL: fabs_v4f64:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [NaN,NaN,NaN,NaN] -; CHECK-NEXT:    vandps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq    %v = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) @@ -219,8 +217,8 @@ define double @fabs_v4f64(<4 x double> %x) nounwind {  define float @fmaxnum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {  ; CHECK-LABEL: fmaxnum_v4f32:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vmaxps %xmm0, %xmm1, 
%xmm2 -; CHECK-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT:    vmaxss %xmm0, %xmm1, %xmm2 +; CHECK-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0  ; CHECK-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0  ; CHECK-NEXT:    retq    %v = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y) @@ -231,10 +229,9 @@ define float @fmaxnum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {  define double @fmaxnum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {  ; CHECK-LABEL: fmaxnum_v4f64:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vmaxpd %ymm0, %ymm1, %ymm2 -; CHECK-NEXT:    vcmpunordpd %ymm0, %ymm0, %ymm0 -; CHECK-NEXT:    vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 -; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT:    vmaxsd %xmm0, %xmm1, %xmm2 +; CHECK-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq    %v = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %x, <4 x double> %y) @@ -245,8 +242,8 @@ define double @fmaxnum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {  define float @fminnum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {  ; CHECK-LABEL: fminnum_v4f32:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vminps %xmm0, %xmm1, %xmm2 -; CHECK-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT:    vminss %xmm0, %xmm1, %xmm2 +; CHECK-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm0  ; CHECK-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0  ; CHECK-NEXT:    retq    %v = call <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y) @@ -257,10 +254,9 @@ define float @fminnum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {  define double @fminnum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {  ; CHECK-LABEL: fminnum_v4f64:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vminpd %ymm0, %ymm1, %ymm2 -; CHECK-NEXT:    vcmpunordpd %ymm0, %ymm0, %ymm0 -; CHECK-NEXT:    vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 -; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT:    vminsd %xmm0, %xmm1, %xmm2 +; CHECK-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq    %v = call <4 x double> @llvm.minnum.v4f64(<4 x double> %x, <4 x double> %y) @@ -309,10 +305,8 @@ define float @copysign_v4f32(<4 x float> %x, <4 x float> %y) nounwind {  define double @copysign_v4f64(<4 x double> %x, <4 x double> %y) nounwind {  ; CHECK-LABEL: copysign_v4f64:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] -; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; CHECK-NEXT:    vandps %xmm3, %xmm1, %xmm1 -; CHECK-NEXT:    vandps %xmm2, %xmm0, %xmm0 +; CHECK-NEXT:    vandps {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0  ; CHECK-NEXT:    vorps %xmm1, %xmm0, %xmm0  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq @@ -324,7 +318,7 @@ define double @copysign_v4f64(<4 x double> %x, <4 x double> %y) nounwind {  define float @floor_v4f32(<4 x float> %x) nounwind {  ; CHECK-LABEL: floor_v4f32:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vroundps $9, %xmm0, %xmm0 +; CHECK-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0  ; CHECK-NEXT:    retq    %v = call <4 x float> @llvm.floor.v4f32(<4 x float> %x)    %r = extractelement <4 x float> %v, i32 0 @@ -334,8 +328,7 @@ define float @floor_v4f32(<4 x float> %x) nounwind {  define double @floor_v4f64(<4 x double> %x) nounwind {  ; CHECK-LABEL: floor_v4f64:  ; 
CHECK:       # %bb.0: -; CHECK-NEXT:    vroundpd $9, %ymm0, %ymm0 -; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq    %v = call <4 x double> @llvm.floor.v4f64(<4 x double> %x) @@ -346,7 +339,7 @@ define double @floor_v4f64(<4 x double> %x) nounwind {  define float @ceil_v4f32(<4 x float> %x) nounwind {  ; CHECK-LABEL: ceil_v4f32:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vroundps $10, %xmm0, %xmm0 +; CHECK-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0  ; CHECK-NEXT:    retq    %v = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x)    %r = extractelement <4 x float> %v, i32 0 @@ -356,8 +349,7 @@ define float @ceil_v4f32(<4 x float> %x) nounwind {  define double @ceil_v4f64(<4 x double> %x) nounwind {  ; CHECK-LABEL: ceil_v4f64:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vroundpd $10, %ymm0, %ymm0 -; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq    %v = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) @@ -368,7 +360,7 @@ define double @ceil_v4f64(<4 x double> %x) nounwind {  define float @trunc_v4f32(<4 x float> %x) nounwind {  ; CHECK-LABEL: trunc_v4f32:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vroundps $11, %xmm0, %xmm0 +; CHECK-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0  ; CHECK-NEXT:    retq    %v = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x)    %r = extractelement <4 x float> %v, i32 0 @@ -378,8 +370,7 @@ define float @trunc_v4f32(<4 x float> %x) nounwind {  define double @trunc_v4f64(<4 x double> %x) nounwind {  ; CHECK-LABEL: trunc_v4f64:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vroundpd $11, %ymm0, %ymm0 -; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq    %v = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) @@ -390,7 +381,7 @@ define double @trunc_v4f64(<4 x double> %x) nounwind {  define float @rint_v4f32(<4 x float> %x) nounwind {  ; CHECK-LABEL: rint_v4f32:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vroundps $4, %xmm0, %xmm0 +; CHECK-NEXT:    vroundss $4, %xmm0, %xmm0, %xmm0  ; CHECK-NEXT:    retq    %v = call <4 x float> @llvm.rint.v4f32(<4 x float> %x)    %r = extractelement <4 x float> %v, i32 0 @@ -400,8 +391,7 @@ define float @rint_v4f32(<4 x float> %x) nounwind {  define double @rint_v4f64(<4 x double> %x) nounwind {  ; CHECK-LABEL: rint_v4f64:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vroundpd $4, %ymm0, %ymm0 -; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT:    vroundsd $4, %xmm0, %xmm0, %xmm0  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq    %v = call <4 x double> @llvm.rint.v4f64(<4 x double> %x) @@ -412,7 +402,7 @@ define double @rint_v4f64(<4 x double> %x) nounwind {  define float @nearbyint_v4f32(<4 x float> %x) nounwind {  ; CHECK-LABEL: nearbyint_v4f32:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vroundps $12, %xmm0, %xmm0 +; CHECK-NEXT:    vroundss $12, %xmm0, %xmm0, %xmm0  ; CHECK-NEXT:    retq    %v = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %x)    %r = extractelement <4 x float> %v, i32 0 @@ -422,8 +412,7 @@ define float @nearbyint_v4f32(<4 x float> %x) nounwind {  define double @nearbyint_v4f64(<4 x double> %x) nounwind {  ; CHECK-LABEL: nearbyint_v4f64:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vroundpd $12, %ymm0, %ymm0 -; CHECK-NEXT:    # kill: def $xmm0 
killed $xmm0 killed $ymm0 +; CHECK-NEXT:    vroundsd $12, %xmm0, %xmm0, %xmm0  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq    %v = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %x) diff --git a/llvm/test/CodeGen/X86/ftrunc.ll b/llvm/test/CodeGen/X86/ftrunc.ll index 26cea1d71f3..56cb6e252cb 100644 --- a/llvm/test/CodeGen/X86/ftrunc.ll +++ b/llvm/test/CodeGen/X86/ftrunc.ll @@ -43,7 +43,7 @@ define double @trunc_unsigned_f64(double %x) #0 {  ; SSE2-NEXT:    subpd {{.*}}(%rip), %xmm1  ; SSE2-NEXT:    movapd %xmm1, %xmm0  ; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE2-NEXT:    addpd %xmm1, %xmm0 +; SSE2-NEXT:    addsd %xmm1, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: trunc_unsigned_f64: diff --git a/llvm/test/CodeGen/X86/haddsub.ll b/llvm/test/CodeGen/X86/haddsub.ll index 305da125a4d..1f6b3b1b947 100644 --- a/llvm/test/CodeGen/X86/haddsub.ll +++ b/llvm/test/CodeGen/X86/haddsub.ll @@ -1366,7 +1366,7 @@ define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) {  ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]  ; SSE3-SLOW-NEXT:    addps %xmm1, %xmm2  ; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE3-SLOW-NEXT:    addps %xmm2, %xmm0 +; SSE3-SLOW-NEXT:    addss %xmm2, %xmm0  ; SSE3-SLOW-NEXT:    retq  ;  ; SSE3-FAST-LABEL: fadd_reduce_v8f32: @@ -1385,7 +1385,7 @@ define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) {  ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; AVX-SLOW-NEXT:    vzeroupper  ; AVX-SLOW-NEXT:    retq  ; @@ -1408,7 +1408,7 @@ define double @fadd_reduce_v4f64(double %a0, <4 x double> %a1) {  ; SSE3-SLOW-NEXT:    addpd %xmm2, %xmm1  ; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0  ; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE3-SLOW-NEXT:    addpd %xmm1, %xmm0 +; SSE3-SLOW-NEXT:    addsd %xmm1, %xmm0  ; SSE3-SLOW-NEXT:    retq  ;  ; SSE3-FAST-LABEL: fadd_reduce_v4f64: @@ -1423,7 +1423,7 @@ define double @fadd_reduce_v4f64(double %a0, <4 x double> %a1) {  ; AVX-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm0  ; AVX-SLOW-NEXT:    vaddpd %xmm0, %xmm1, %xmm0  ; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; AVX-SLOW-NEXT:    vzeroupper  ; AVX-SLOW-NEXT:    retq  ; diff --git a/llvm/test/CodeGen/X86/scalar-int-to-fp.ll b/llvm/test/CodeGen/X86/scalar-int-to-fp.ll index 0d3836c89f2..63889f602f6 100644 --- a/llvm/test/CodeGen/X86/scalar-int-to-fp.ll +++ b/llvm/test/CodeGen/X86/scalar-int-to-fp.ll @@ -638,8 +638,8 @@ define double @u64_to_d(i64 %a) nounwind {  ; SSE2_32-NEXT:    subpd {{\.LCPI.*}}, %xmm0  ; SSE2_32-NEXT:    movapd %xmm0, %xmm1  ; SSE2_32-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2_32-NEXT:    addpd %xmm0, %xmm1 -; SSE2_32-NEXT:    movlpd %xmm1, (%esp) +; SSE2_32-NEXT:    addsd %xmm0, %xmm1 +; SSE2_32-NEXT:    movsd %xmm1, (%esp)  ; SSE2_32-NEXT:    fldl (%esp)  ; SSE2_32-NEXT:    movl %ebp, %esp  ; SSE2_32-NEXT:    popl %ebp @@ -652,7 +652,7 @@ define double @u64_to_d(i64 %a) nounwind {  ; SSE2_64-NEXT:    subpd {{.*}}(%rip), %xmm1  ; SSE2_64-NEXT:    movapd %xmm1, %xmm0  ; SSE2_64-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE2_64-NEXT:    addpd %xmm1, %xmm0 +; SSE2_64-NEXT:    addsd %xmm1, %xmm0  ; SSE2_64-NEXT:    retq  ;  ; X87-LABEL: u64_to_d: 
diff --git a/llvm/test/CodeGen/X86/vec_extract.ll b/llvm/test/CodeGen/X86/vec_extract.ll index 98e2d32b0b0..724ac9032e3 100644 --- a/llvm/test/CodeGen/X86/vec_extract.ll +++ b/llvm/test/CodeGen/X86/vec_extract.ll @@ -7,15 +7,15 @@ define void @test1(<4 x float>* %F, float* %f) nounwind {  ; X32:       # %bb.0: # %entry  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT:    movaps (%ecx), %xmm0 -; X32-NEXT:    addps %xmm0, %xmm0 +; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-NEXT:    addss %xmm0, %xmm0  ; X32-NEXT:    movss %xmm0, (%eax)  ; X32-NEXT:    retl  ;  ; X64-LABEL: test1:  ; X64:       # %bb.0: # %entry -; X64-NEXT:    movaps (%rdi), %xmm0 -; X64-NEXT:    addps %xmm0, %xmm0 +; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT:    addss %xmm0, %xmm0  ; X64-NEXT:    movss %xmm0, (%rsi)  ; X64-NEXT:    retq  entry: diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll index 82ec7adfab3..7344a34ba8a 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll @@ -15,25 +15,25 @@ define float @test_v2f32(float %a0, <2 x float> %a1) {  ; SSE2:       # %bb.0:  ; SSE2-NEXT:    movaps %xmm1, %xmm0  ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] -; SSE2-NEXT:    addps %xmm1, %xmm0 +; SSE2-NEXT:    addss %xmm1, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: test_v2f32:  ; SSE41:       # %bb.0:  ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT:    addps %xmm1, %xmm0 +; SSE41-NEXT:    addss %xmm1, %xmm0  ; SSE41-NEXT:    retq  ;  ; AVX-LABEL: test_v2f32:  ; AVX:       # %bb.0:  ; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0 +; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0  ; AVX-NEXT:    retq  ;  ; AVX512-LABEL: test_v2f32:  ; AVX512:       # %bb.0:  ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; AVX512-NEXT:    vaddps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0  ; AVX512-NEXT:    retq    %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1)    ret float %1 @@ -47,7 +47,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {  ; SSE2-NEXT:    addps %xmm1, %xmm2  ; SSE2-NEXT:    movaps %xmm2, %xmm0  ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3] -; SSE2-NEXT:    addps %xmm2, %xmm0 +; SSE2-NEXT:    addss %xmm2, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: test_v4f32: @@ -56,7 +56,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {  ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]  ; SSE41-NEXT:    addps %xmm1, %xmm2  ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT:    addps %xmm2, %xmm0 +; SSE41-NEXT:    addss %xmm2, %xmm0  ; SSE41-NEXT:    retq  ;  ; AVX-LABEL: test_v4f32: @@ -64,7 +64,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]  ; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0  ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    retq  ;  ; AVX512-LABEL: test_v4f32: @@ -72,7 +72,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]  ; AVX512-NEXT:    vaddps %xmm0, %xmm1, %xmm0  ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT:    vaddps 
%xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    retq    %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1)    ret float %1 @@ -87,7 +87,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {  ; SSE2-NEXT:    addps %xmm1, %xmm2  ; SSE2-NEXT:    movaps %xmm2, %xmm0  ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3] -; SSE2-NEXT:    addps %xmm2, %xmm0 +; SSE2-NEXT:    addss %xmm2, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: test_v8f32: @@ -97,7 +97,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {  ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]  ; SSE41-NEXT:    addps %xmm1, %xmm2  ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT:    addps %xmm2, %xmm0 +; SSE41-NEXT:    addss %xmm2, %xmm0  ; SSE41-NEXT:    retq  ;  ; AVX-LABEL: test_v8f32: @@ -107,7 +107,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -118,7 +118,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1) @@ -136,7 +136,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {  ; SSE2-NEXT:    addps %xmm1, %xmm2  ; SSE2-NEXT:    movaps %xmm2, %xmm0  ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3] -; SSE2-NEXT:    addps %xmm2, %xmm0 +; SSE2-NEXT:    addss %xmm2, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: test_v16f32: @@ -148,7 +148,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {  ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]  ; SSE41-NEXT:    addps %xmm1, %xmm2  ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT:    addps %xmm2, %xmm0 +; SSE41-NEXT:    addss %xmm2, %xmm0  ; SSE41-NEXT:    retq  ;  ; AVX-LABEL: test_v16f32: @@ -159,7 +159,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -172,7 +172,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1) @@ -188,26 +188,26 @@ define float @test_v2f32_zero(<2 x float> %a0) {  ; SSE2:       # %bb.0:  ; SSE2-NEXT:    movaps %xmm0, %xmm1  ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] -; SSE2-NEXT:    addps %xmm0, %xmm1 +; SSE2-NEXT:    addss %xmm0, %xmm1  ; 
SSE2-NEXT:    movaps %xmm1, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: test_v2f32_zero:  ; SSE41:       # %bb.0:  ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT:    addps %xmm1, %xmm0 +; SSE41-NEXT:    addss %xmm1, %xmm0  ; SSE41-NEXT:    retq  ;  ; AVX-LABEL: test_v2f32_zero:  ; AVX:       # %bb.0:  ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    retq  ;  ; AVX512-LABEL: test_v2f32_zero:  ; AVX512:       # %bb.0:  ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    retq    %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0)    ret float %1 @@ -221,7 +221,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {  ; SSE2-NEXT:    addps %xmm0, %xmm1  ; SSE2-NEXT:    movaps %xmm1, %xmm0  ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] -; SSE2-NEXT:    addps %xmm1, %xmm0 +; SSE2-NEXT:    addss %xmm1, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: test_v4f32_zero: @@ -230,7 +230,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {  ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]  ; SSE41-NEXT:    addps %xmm0, %xmm1  ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT:    addps %xmm0, %xmm1 +; SSE41-NEXT:    addss %xmm0, %xmm1  ; SSE41-NEXT:    movaps %xmm1, %xmm0  ; SSE41-NEXT:    retq  ; @@ -239,7 +239,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    retq  ;  ; AVX512-LABEL: test_v4f32_zero: @@ -247,7 +247,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    retq    %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0)    ret float %1 @@ -262,7 +262,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {  ; SSE2-NEXT:    addps %xmm0, %xmm1  ; SSE2-NEXT:    movaps %xmm1, %xmm0  ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] -; SSE2-NEXT:    addps %xmm1, %xmm0 +; SSE2-NEXT:    addss %xmm1, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: test_v8f32_zero: @@ -272,7 +272,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {  ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]  ; SSE41-NEXT:    addps %xmm0, %xmm1  ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT:    addps %xmm0, %xmm1 +; SSE41-NEXT:    addss %xmm0, %xmm1  ; SSE41-NEXT:    movaps %xmm1, %xmm0  ; SSE41-NEXT:    retq  ; @@ -283,7 +283,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -294,7 +294,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; 
AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0) @@ -312,7 +312,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {  ; SSE2-NEXT:    addps %xmm0, %xmm1  ; SSE2-NEXT:    movaps %xmm1, %xmm0  ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] -; SSE2-NEXT:    addps %xmm1, %xmm0 +; SSE2-NEXT:    addss %xmm1, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: test_v16f32_zero: @@ -324,7 +324,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {  ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]  ; SSE41-NEXT:    addps %xmm0, %xmm1  ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT:    addps %xmm0, %xmm1 +; SSE41-NEXT:    addss %xmm0, %xmm1  ; SSE41-NEXT:    movaps %xmm1, %xmm0  ; SSE41-NEXT:    retq  ; @@ -336,7 +336,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -349,7 +349,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0) @@ -365,26 +365,26 @@ define float @test_v2f32_undef(<2 x float> %a0) {  ; SSE2:       # %bb.0:  ; SSE2-NEXT:    movaps %xmm0, %xmm1  ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] -; SSE2-NEXT:    addps %xmm0, %xmm1 +; SSE2-NEXT:    addss %xmm0, %xmm1  ; SSE2-NEXT:    movaps %xmm1, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: test_v2f32_undef:  ; SSE41:       # %bb.0:  ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT:    addps %xmm1, %xmm0 +; SSE41-NEXT:    addss %xmm1, %xmm0  ; SSE41-NEXT:    retq  ;  ; AVX-LABEL: test_v2f32_undef:  ; AVX:       # %bb.0:  ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    retq  ;  ; AVX512-LABEL: test_v2f32_undef:  ; AVX512:       # %bb.0:  ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    retq    %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0)    ret float %1 @@ -398,7 +398,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {  ; SSE2-NEXT:    addps %xmm0, %xmm1  ; SSE2-NEXT:    movaps %xmm1, %xmm0  ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] -; SSE2-NEXT:    addps %xmm1, %xmm0 +; SSE2-NEXT:    addss %xmm1, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: test_v4f32_undef: @@ -407,7 +407,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {  ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]  ; SSE41-NEXT:    addps %xmm0, %xmm1  ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 
= xmm1[1,1,3,3] -; SSE41-NEXT:    addps %xmm0, %xmm1 +; SSE41-NEXT:    addss %xmm0, %xmm1  ; SSE41-NEXT:    movaps %xmm1, %xmm0  ; SSE41-NEXT:    retq  ; @@ -416,7 +416,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    retq  ;  ; AVX512-LABEL: test_v4f32_undef: @@ -424,7 +424,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    retq    %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0)    ret float %1 @@ -439,7 +439,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {  ; SSE2-NEXT:    addps %xmm0, %xmm1  ; SSE2-NEXT:    movaps %xmm1, %xmm0  ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] -; SSE2-NEXT:    addps %xmm1, %xmm0 +; SSE2-NEXT:    addss %xmm1, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: test_v8f32_undef: @@ -449,7 +449,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {  ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]  ; SSE41-NEXT:    addps %xmm0, %xmm1  ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT:    addps %xmm0, %xmm1 +; SSE41-NEXT:    addss %xmm0, %xmm1  ; SSE41-NEXT:    movaps %xmm1, %xmm0  ; SSE41-NEXT:    retq  ; @@ -460,7 +460,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -471,7 +471,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float undef, <8 x float> %a0) @@ -489,7 +489,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {  ; SSE2-NEXT:    addps %xmm0, %xmm1  ; SSE2-NEXT:    movaps %xmm1, %xmm0  ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] -; SSE2-NEXT:    addps %xmm1, %xmm0 +; SSE2-NEXT:    addss %xmm1, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: test_v16f32_undef: @@ -501,7 +501,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {  ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]  ; SSE41-NEXT:    addps %xmm0, %xmm1  ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT:    addps %xmm0, %xmm1 +; SSE41-NEXT:    addss %xmm0, %xmm1  ; SSE41-NEXT:    movaps %xmm1, %xmm0  ; SSE41-NEXT:    retq  ; @@ -513,7 +513,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    
vzeroupper  ; AVX-NEXT:    retq  ; @@ -526,7 +526,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float undef, <16 x float> %a0) @@ -542,19 +542,19 @@ define double @test_v2f64(double %a0, <2 x double> %a1) {  ; SSE:       # %bb.0:  ; SSE-NEXT:    movapd %xmm1, %xmm0  ; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT:    addpd %xmm1, %xmm0 +; SSE-NEXT:    addsd %xmm1, %xmm0  ; SSE-NEXT:    retq  ;  ; AVX-LABEL: test_v2f64:  ; AVX:       # %bb.0:  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0  ; AVX-NEXT:    retq  ;  ; AVX512-LABEL: test_v2f64:  ; AVX512:       # %bb.0:  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX512-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0  ; AVX512-NEXT:    retq    %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1)    ret double %1 @@ -566,7 +566,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {  ; SSE-NEXT:    addpd %xmm2, %xmm1  ; SSE-NEXT:    movapd %xmm1, %xmm0  ; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT:    addpd %xmm1, %xmm0 +; SSE-NEXT:    addsd %xmm1, %xmm0  ; SSE-NEXT:    retq  ;  ; AVX-LABEL: test_v4f64: @@ -574,7 +574,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {  ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0  ; AVX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -583,7 +583,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {  ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm0  ; AVX512-NEXT:    vaddpd %xmm0, %xmm1, %xmm0  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1) @@ -598,7 +598,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {  ; SSE-NEXT:    addpd %xmm2, %xmm1  ; SSE-NEXT:    movapd %xmm1, %xmm0  ; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT:    addpd %xmm1, %xmm0 +; SSE-NEXT:    addsd %xmm1, %xmm0  ; SSE-NEXT:    retq  ;  ; AVX-LABEL: test_v8f64: @@ -607,7 +607,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {  ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -618,7 +618,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {  ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:   
 vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1) @@ -637,7 +637,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {  ; SSE-NEXT:    addpd %xmm1, %xmm4  ; SSE-NEXT:    movapd %xmm4, %xmm0  ; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE-NEXT:    addpd %xmm4, %xmm0 +; SSE-NEXT:    addsd %xmm4, %xmm0  ; SSE-NEXT:    retq  ;  ; AVX-LABEL: test_v16f64: @@ -648,7 +648,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {  ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -660,7 +660,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {  ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1) @@ -676,20 +676,20 @@ define double @test_v2f64_zero(<2 x double> %a0) {  ; SSE:       # %bb.0:  ; SSE-NEXT:    movapd %xmm0, %xmm1  ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT:    addpd %xmm0, %xmm1 +; SSE-NEXT:    addsd %xmm0, %xmm1  ; SSE-NEXT:    movapd %xmm1, %xmm0  ; SSE-NEXT:    retq  ;  ; AVX-LABEL: test_v2f64_zero:  ; AVX:       # %bb.0:  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    retq  ;  ; AVX512-LABEL: test_v2f64_zero:  ; AVX512:       # %bb.0:  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    retq    %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0)    ret double %1 @@ -701,7 +701,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {  ; SSE-NEXT:    addpd %xmm1, %xmm0  ; SSE-NEXT:    movapd %xmm0, %xmm1  ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT:    addpd %xmm0, %xmm1 +; SSE-NEXT:    addsd %xmm0, %xmm1  ; SSE-NEXT:    movapd %xmm1, %xmm0  ; SSE-NEXT:    retq  ; @@ -710,7 +710,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {  ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -719,7 +719,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {  ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0) @@ -734,7 +734,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {  ; SSE-NEXT:    addpd %xmm1, %xmm0  ; SSE-NEXT:    movapd %xmm0, %xmm1  ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = 
xmm1[1],xmm0[1] -; SSE-NEXT:    addpd %xmm0, %xmm1 +; SSE-NEXT:    addsd %xmm0, %xmm1  ; SSE-NEXT:    movapd %xmm1, %xmm0  ; SSE-NEXT:    retq  ; @@ -744,7 +744,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {  ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -755,7 +755,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {  ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0) @@ -774,7 +774,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {  ; SSE-NEXT:    addpd %xmm0, %xmm1  ; SSE-NEXT:    movapd %xmm1, %xmm0  ; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT:    addpd %xmm1, %xmm0 +; SSE-NEXT:    addsd %xmm1, %xmm0  ; SSE-NEXT:    retq  ;  ; AVX-LABEL: test_v16f64_zero: @@ -785,7 +785,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {  ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -797,7 +797,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {  ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0) @@ -813,20 +813,20 @@ define double @test_v2f64_undef(<2 x double> %a0) {  ; SSE:       # %bb.0:  ; SSE-NEXT:    movapd %xmm0, %xmm1  ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT:    addpd %xmm0, %xmm1 +; SSE-NEXT:    addsd %xmm0, %xmm1  ; SSE-NEXT:    movapd %xmm1, %xmm0  ; SSE-NEXT:    retq  ;  ; AVX-LABEL: test_v2f64_undef:  ; AVX:       # %bb.0:  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    retq  ;  ; AVX512-LABEL: test_v2f64_undef:  ; AVX512:       # %bb.0:  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    retq    %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0)    ret double %1 @@ -838,7 +838,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {  ; SSE-NEXT:    addpd %xmm1, %xmm0  ; SSE-NEXT:    movapd %xmm0, %xmm1  ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT:    addpd %xmm0, %xmm1 +; SSE-NEXT:    addsd %xmm0, %xmm1  ; SSE-NEXT:    movapd %xmm1, %xmm0  ; SSE-NEXT:    retq  ; @@ -847,7 +847,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {  ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = 
xmm0[1,0] -; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -856,7 +856,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {  ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double undef, <4 x double> %a0) @@ -871,7 +871,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {  ; SSE-NEXT:    addpd %xmm1, %xmm0  ; SSE-NEXT:    movapd %xmm0, %xmm1  ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT:    addpd %xmm0, %xmm1 +; SSE-NEXT:    addsd %xmm0, %xmm1  ; SSE-NEXT:    movapd %xmm1, %xmm0  ; SSE-NEXT:    retq  ; @@ -881,7 +881,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {  ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -892,7 +892,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {  ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double undef, <8 x double> %a0) @@ -911,7 +911,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {  ; SSE-NEXT:    addpd %xmm0, %xmm1  ; SSE-NEXT:    movapd %xmm1, %xmm0  ; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT:    addpd %xmm1, %xmm0 +; SSE-NEXT:    addsd %xmm1, %xmm0  ; SSE-NEXT:    retq  ;  ; AVX-LABEL: test_v16f64_undef: @@ -922,7 +922,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {  ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -934,7 +934,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {  ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double undef, <16 x double> %a0) diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll index 971d7f0478e..3cd94151aca 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll @@ -15,25 +15,25 @@ define float @test_v2f32(float %a0, <2 x float> %a1) {  ; SSE2:       # %bb.0:  ; SSE2-NEXT:    movaps %xmm1, %xmm0  ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] -; SSE2-NEXT:    mulps %xmm1, %xmm0 +; SSE2-NEXT:    mulss %xmm1, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: test_v2f32:  ; SSE41:       # %bb.0:  ; 
SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT:    mulps %xmm1, %xmm0 +; SSE41-NEXT:    mulss %xmm1, %xmm0  ; SSE41-NEXT:    retq  ;  ; AVX-LABEL: test_v2f32:  ; AVX:       # %bb.0:  ; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; AVX-NEXT:    vmulps %xmm0, %xmm1, %xmm0 +; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0  ; AVX-NEXT:    retq  ;  ; AVX512-LABEL: test_v2f32:  ; AVX512:       # %bb.0:  ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0  ; AVX512-NEXT:    retq    %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float %a0, <2 x float> %a1)    ret float %1 @@ -47,7 +47,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {  ; SSE2-NEXT:    mulps %xmm1, %xmm2  ; SSE2-NEXT:    movaps %xmm2, %xmm0  ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3] -; SSE2-NEXT:    mulps %xmm2, %xmm0 +; SSE2-NEXT:    mulss %xmm2, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: test_v4f32: @@ -56,7 +56,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {  ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]  ; SSE41-NEXT:    mulps %xmm1, %xmm2  ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT:    mulps %xmm2, %xmm0 +; SSE41-NEXT:    mulss %xmm2, %xmm0  ; SSE41-NEXT:    retq  ;  ; AVX-LABEL: test_v4f32: @@ -64,7 +64,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]  ; AVX-NEXT:    vmulps %xmm0, %xmm1, %xmm0  ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    retq  ;  ; AVX512-LABEL: test_v4f32: @@ -72,7 +72,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]  ; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0  ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    retq    %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float %a0, <4 x float> %a1)    ret float %1 @@ -87,7 +87,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {  ; SSE2-NEXT:    mulps %xmm1, %xmm2  ; SSE2-NEXT:    movaps %xmm2, %xmm0  ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3] -; SSE2-NEXT:    mulps %xmm2, %xmm0 +; SSE2-NEXT:    mulss %xmm2, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: test_v8f32: @@ -97,7 +97,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {  ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]  ; SSE41-NEXT:    mulps %xmm1, %xmm2  ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT:    mulps %xmm2, %xmm0 +; SSE41-NEXT:    mulss %xmm2, %xmm0  ; SSE41-NEXT:    retq  ;  ; AVX-LABEL: test_v8f32: @@ -107,7 +107,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -118,7 +118,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT:    vmulps %xmm1, %xmm0, 
%xmm0 +; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1) @@ -136,7 +136,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {  ; SSE2-NEXT:    mulps %xmm1, %xmm2  ; SSE2-NEXT:    movaps %xmm2, %xmm0  ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3] -; SSE2-NEXT:    mulps %xmm2, %xmm0 +; SSE2-NEXT:    mulss %xmm2, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: test_v16f32: @@ -148,7 +148,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {  ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]  ; SSE41-NEXT:    mulps %xmm1, %xmm2  ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE41-NEXT:    mulps %xmm2, %xmm0 +; SSE41-NEXT:    mulss %xmm2, %xmm0  ; SSE41-NEXT:    retq  ;  ; AVX-LABEL: test_v16f32: @@ -159,7 +159,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -172,7 +172,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1) @@ -188,26 +188,26 @@ define float @test_v2f32_zero(<2 x float> %a0) {  ; SSE2:       # %bb.0:  ; SSE2-NEXT:    movaps %xmm0, %xmm1  ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] -; SSE2-NEXT:    mulps %xmm0, %xmm1 +; SSE2-NEXT:    mulss %xmm0, %xmm1  ; SSE2-NEXT:    movaps %xmm1, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: test_v2f32_zero:  ; SSE41:       # %bb.0:  ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT:    mulps %xmm1, %xmm0 +; SSE41-NEXT:    mulss %xmm1, %xmm0  ; SSE41-NEXT:    retq  ;  ; AVX-LABEL: test_v2f32_zero:  ; AVX:       # %bb.0:  ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    retq  ;  ; AVX512-LABEL: test_v2f32_zero:  ; AVX512:       # %bb.0:  ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    retq    %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)    ret float %1 @@ -221,7 +221,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {  ; SSE2-NEXT:    mulps %xmm0, %xmm1  ; SSE2-NEXT:    movaps %xmm1, %xmm0  ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] -; SSE2-NEXT:    mulps %xmm1, %xmm0 +; SSE2-NEXT:    mulss %xmm1, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: test_v4f32_zero: @@ -230,7 +230,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {  ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]  ; SSE41-NEXT:    mulps %xmm0, %xmm1  ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT:    mulps %xmm0, %xmm1 +; SSE41-NEXT:    mulss %xmm0, %xmm1  ; SSE41-NEXT:    movaps %xmm1, %xmm0  ; SSE41-NEXT:  
  retq  ; @@ -239,7 +239,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    retq  ;  ; AVX512-LABEL: test_v4f32_zero: @@ -247,7 +247,7 @@ define float @test_v4f32_zero(<4 x float> %a0) {  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    retq    %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)    ret float %1 @@ -262,7 +262,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {  ; SSE2-NEXT:    mulps %xmm0, %xmm1  ; SSE2-NEXT:    movaps %xmm1, %xmm0  ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] -; SSE2-NEXT:    mulps %xmm1, %xmm0 +; SSE2-NEXT:    mulss %xmm1, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: test_v8f32_zero: @@ -272,7 +272,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {  ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]  ; SSE41-NEXT:    mulps %xmm0, %xmm1  ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT:    mulps %xmm0, %xmm1 +; SSE41-NEXT:    mulss %xmm0, %xmm1  ; SSE41-NEXT:    movaps %xmm1, %xmm0  ; SSE41-NEXT:    retq  ; @@ -283,7 +283,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -294,7 +294,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0) @@ -312,7 +312,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {  ; SSE2-NEXT:    mulps %xmm0, %xmm1  ; SSE2-NEXT:    movaps %xmm1, %xmm0  ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] -; SSE2-NEXT:    mulps %xmm1, %xmm0 +; SSE2-NEXT:    mulss %xmm1, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: test_v16f32_zero: @@ -324,7 +324,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {  ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]  ; SSE41-NEXT:    mulps %xmm0, %xmm1  ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT:    mulps %xmm0, %xmm1 +; SSE41-NEXT:    mulss %xmm0, %xmm1  ; SSE41-NEXT:    movaps %xmm1, %xmm0  ; SSE41-NEXT:    retq  ; @@ -336,7 +336,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -349,7 +349,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; 
AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0) @@ -365,26 +365,26 @@ define float @test_v2f32_undef(<2 x float> %a0) {  ; SSE2:       # %bb.0:  ; SSE2-NEXT:    movaps %xmm0, %xmm1  ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] -; SSE2-NEXT:    mulps %xmm0, %xmm1 +; SSE2-NEXT:    mulss %xmm0, %xmm1  ; SSE2-NEXT:    movaps %xmm1, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: test_v2f32_undef:  ; SSE41:       # %bb.0:  ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE41-NEXT:    mulps %xmm1, %xmm0 +; SSE41-NEXT:    mulss %xmm1, %xmm0  ; SSE41-NEXT:    retq  ;  ; AVX-LABEL: test_v2f32_undef:  ; AVX:       # %bb.0:  ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    retq  ;  ; AVX512-LABEL: test_v2f32_undef:  ; AVX512:       # %bb.0:  ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    retq    %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float undef, <2 x float> %a0)    ret float %1 @@ -398,7 +398,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {  ; SSE2-NEXT:    mulps %xmm0, %xmm1  ; SSE2-NEXT:    movaps %xmm1, %xmm0  ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] -; SSE2-NEXT:    mulps %xmm1, %xmm0 +; SSE2-NEXT:    mulss %xmm1, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: test_v4f32_undef: @@ -407,7 +407,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {  ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]  ; SSE41-NEXT:    mulps %xmm0, %xmm1  ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT:    mulps %xmm0, %xmm1 +; SSE41-NEXT:    mulss %xmm0, %xmm1  ; SSE41-NEXT:    movaps %xmm1, %xmm0  ; SSE41-NEXT:    retq  ; @@ -416,7 +416,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    retq  ;  ; AVX512-LABEL: test_v4f32_undef: @@ -424,7 +424,7 @@ define float @test_v4f32_undef(<4 x float> %a0) {  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    retq    %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> %a0)    ret float %1 @@ -439,7 +439,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {  ; SSE2-NEXT:    mulps %xmm0, %xmm1  ; SSE2-NEXT:    movaps %xmm1, %xmm0  ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] -; SSE2-NEXT:    mulps %xmm1, %xmm0 +; SSE2-NEXT:    mulss %xmm1, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: test_v8f32_undef: @@ -449,7 +449,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {  ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]  ; SSE41-NEXT:    mulps %xmm0, %xmm1  ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = 
xmm1[1,1,3,3] -; SSE41-NEXT:    mulps %xmm0, %xmm1 +; SSE41-NEXT:    mulss %xmm0, %xmm1  ; SSE41-NEXT:    movaps %xmm1, %xmm0  ; SSE41-NEXT:    retq  ; @@ -460,7 +460,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -471,7 +471,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float undef, <8 x float> %a0) @@ -489,7 +489,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {  ; SSE2-NEXT:    mulps %xmm0, %xmm1  ; SSE2-NEXT:    movaps %xmm1, %xmm0  ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] -; SSE2-NEXT:    mulps %xmm1, %xmm0 +; SSE2-NEXT:    mulss %xmm1, %xmm0  ; SSE2-NEXT:    retq  ;  ; SSE41-LABEL: test_v16f32_undef: @@ -501,7 +501,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {  ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]  ; SSE41-NEXT:    mulps %xmm0, %xmm1  ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE41-NEXT:    mulps %xmm0, %xmm1 +; SSE41-NEXT:    mulss %xmm0, %xmm1  ; SSE41-NEXT:    movaps %xmm1, %xmm0  ; SSE41-NEXT:    retq  ; @@ -513,7 +513,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -526,7 +526,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]  ; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float undef, <16 x float> %a0) @@ -542,19 +542,19 @@ define double @test_v2f64(double %a0, <2 x double> %a1) {  ; SSE:       # %bb.0:  ; SSE-NEXT:    movapd %xmm1, %xmm0  ; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT:    mulpd %xmm1, %xmm0 +; SSE-NEXT:    mulsd %xmm1, %xmm0  ; SSE-NEXT:    retq  ;  ; AVX-LABEL: test_v2f64:  ; AVX:       # %bb.0:  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX-NEXT:    vmulpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0  ; AVX-NEXT:    retq  ;  ; AVX512-LABEL: test_v2f64:  ; AVX512:       # %bb.0:  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX512-NEXT:    vmulpd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm0  ; AVX512-NEXT:    retq    %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double %a0, <2 x double> %a1)    ret double %1 @@ -566,7 +566,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {  ; SSE-NEXT:    mulpd %xmm2, %xmm1  ; SSE-NEXT:    movapd %xmm1, %xmm0  ; SSE-NEXT:    
unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT:    mulpd %xmm1, %xmm0 +; SSE-NEXT:    mulsd %xmm1, %xmm0  ; SSE-NEXT:    retq  ;  ; AVX-LABEL: test_v4f64: @@ -574,7 +574,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {  ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0  ; AVX-NEXT:    vmulpd %xmm0, %xmm1, %xmm0  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -583,7 +583,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {  ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm0  ; AVX512-NEXT:    vmulpd %xmm0, %xmm1, %xmm0  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1) @@ -598,7 +598,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {  ; SSE-NEXT:    mulpd %xmm2, %xmm1  ; SSE-NEXT:    movapd %xmm1, %xmm0  ; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT:    mulpd %xmm1, %xmm0 +; SSE-NEXT:    mulsd %xmm1, %xmm0  ; SSE-NEXT:    retq  ;  ; AVX-LABEL: test_v8f64: @@ -607,7 +607,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {  ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -618,7 +618,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {  ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1) @@ -637,7 +637,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {  ; SSE-NEXT:    mulpd %xmm1, %xmm4  ; SSE-NEXT:    movapd %xmm4, %xmm0  ; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE-NEXT:    mulpd %xmm4, %xmm0 +; SSE-NEXT:    mulsd %xmm4, %xmm0  ; SSE-NEXT:    retq  ;  ; AVX-LABEL: test_v16f64: @@ -648,7 +648,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {  ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -660,7 +660,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {  ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1) @@ -676,20 +676,20 @@ define double @test_v2f64_zero(<2 x double> %a0) {  ; SSE:       # %bb.0:  ; SSE-NEXT:    movapd %xmm0, %xmm1  ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT:    mulpd %xmm0, 
%xmm1 +; SSE-NEXT:    mulsd %xmm0, %xmm1  ; SSE-NEXT:    movapd %xmm1, %xmm0  ; SSE-NEXT:    retq  ;  ; AVX-LABEL: test_v2f64_zero:  ; AVX:       # %bb.0:  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    retq  ;  ; AVX512-LABEL: test_v2f64_zero:  ; AVX512:       # %bb.0:  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    retq    %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)    ret double %1 @@ -701,7 +701,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {  ; SSE-NEXT:    mulpd %xmm1, %xmm0  ; SSE-NEXT:    movapd %xmm0, %xmm1  ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT:    mulpd %xmm0, %xmm1 +; SSE-NEXT:    mulsd %xmm0, %xmm1  ; SSE-NEXT:    movapd %xmm1, %xmm0  ; SSE-NEXT:    retq  ; @@ -710,7 +710,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {  ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -719,7 +719,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {  ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0) @@ -734,7 +734,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {  ; SSE-NEXT:    mulpd %xmm1, %xmm0  ; SSE-NEXT:    movapd %xmm0, %xmm1  ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT:    mulpd %xmm0, %xmm1 +; SSE-NEXT:    mulsd %xmm0, %xmm1  ; SSE-NEXT:    movapd %xmm1, %xmm0  ; SSE-NEXT:    retq  ; @@ -744,7 +744,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {  ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -755,7 +755,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {  ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0) @@ -774,7 +774,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {  ; SSE-NEXT:    mulpd %xmm0, %xmm1  ; SSE-NEXT:    movapd %xmm1, %xmm0  ; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT:    mulpd %xmm1, %xmm0 +; SSE-NEXT:    mulsd %xmm1, %xmm0  ; SSE-NEXT:    retq  ;  ; AVX-LABEL: test_v16f64_zero: @@ -785,7 +785,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {  ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT:    vmulpd %xmm1, %xmm0, 
%xmm0 +; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -797,7 +797,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {  ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0) @@ -813,20 +813,20 @@ define double @test_v2f64_undef(<2 x double> %a0) {  ; SSE:       # %bb.0:  ; SSE-NEXT:    movapd %xmm0, %xmm1  ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT:    mulpd %xmm0, %xmm1 +; SSE-NEXT:    mulsd %xmm0, %xmm1  ; SSE-NEXT:    movapd %xmm1, %xmm0  ; SSE-NEXT:    retq  ;  ; AVX-LABEL: test_v2f64_undef:  ; AVX:       # %bb.0:  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    retq  ;  ; AVX512-LABEL: test_v2f64_undef:  ; AVX512:       # %bb.0:  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    retq    %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double undef, <2 x double> %a0)    ret double %1 @@ -838,7 +838,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {  ; SSE-NEXT:    mulpd %xmm1, %xmm0  ; SSE-NEXT:    movapd %xmm0, %xmm1  ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT:    mulpd %xmm0, %xmm1 +; SSE-NEXT:    mulsd %xmm0, %xmm1  ; SSE-NEXT:    movapd %xmm1, %xmm0  ; SSE-NEXT:    retq  ; @@ -847,7 +847,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {  ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -856,7 +856,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {  ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double undef, <4 x double> %a0) @@ -871,7 +871,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {  ; SSE-NEXT:    mulpd %xmm1, %xmm0  ; SSE-NEXT:    movapd %xmm0, %xmm1  ; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE-NEXT:    mulpd %xmm0, %xmm1 +; SSE-NEXT:    mulsd %xmm0, %xmm1  ; SSE-NEXT:    movapd %xmm1, %xmm0  ; SSE-NEXT:    retq  ; @@ -881,7 +881,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {  ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -892,7 +892,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {  ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT:    
vmulpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double undef, <8 x double> %a0) @@ -911,7 +911,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {  ; SSE-NEXT:    mulpd %xmm0, %xmm1  ; SSE-NEXT:    movapd %xmm1, %xmm0  ; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT:    mulpd %xmm1, %xmm0 +; SSE-NEXT:    mulsd %xmm1, %xmm0  ; SSE-NEXT:    retq  ;  ; AVX-LABEL: test_v16f64_undef: @@ -922,7 +922,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {  ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 +; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; AVX-NEXT:    vzeroupper  ; AVX-NEXT:    retq  ; @@ -934,7 +934,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {  ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq    %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)  | 