diff options
Diffstat (limited to 'llvm')
23 files changed, 518 insertions, 353 deletions
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 0121bf9f73d..d9701c2a9e8 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -180,12 +180,6 @@ let TargetPrefix = "x86" in {  // Arithmetic ops  let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.". -  def int_x86_sse_sqrt_ss : GCCBuiltin<"__builtin_ia32_sqrtss">, -              Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], -                        [IntrNoMem]>; -  def int_x86_sse_sqrt_ps : GCCBuiltin<"__builtin_ia32_sqrtps">, -              Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], -                        [IntrNoMem]>;    def int_x86_sse_rcp_ss : GCCBuiltin<"__builtin_ia32_rcpss">,                Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty],                          [IntrNoMem]>; @@ -304,12 +298,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".  // FP arithmetic ops  let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.". -  def int_x86_sse2_sqrt_sd : GCCBuiltin<"__builtin_ia32_sqrtsd">, -              Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], -                        [IntrNoMem]>; -  def int_x86_sse2_sqrt_pd : GCCBuiltin<"__builtin_ia32_sqrtpd">, -              Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], -                        [IntrNoMem]>;    def int_x86_sse2_min_sd : GCCBuiltin<"__builtin_ia32_minsd">,                Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,                           llvm_v2f64_ty], [IntrNoMem]>; @@ -961,11 +949,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".          Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,                    llvm_v8f32_ty], [IntrNoMem]>; -  def int_x86_avx_sqrt_pd_256 : GCCBuiltin<"__builtin_ia32_sqrtpd256">, -        Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty], [IntrNoMem]>; -  def int_x86_avx_sqrt_ps_256 : GCCBuiltin<"__builtin_ia32_sqrtps256">, -        Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; -    def int_x86_avx_rsqrt_ps_256 : GCCBuiltin<"__builtin_ia32_rsqrtps256">,          Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; @@ -3868,29 +3851,17 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".            Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,                      llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; -  def int_x86_avx512_mask_sqrt_ss : GCCBuiltin<"__builtin_ia32_sqrtss_round_mask">, +  def int_x86_avx512_mask_sqrt_ss :          Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,                                      llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; -  def int_x86_avx512_mask_sqrt_sd : GCCBuiltin<"__builtin_ia32_sqrtsd_round_mask">, +  def int_x86_avx512_mask_sqrt_sd :          Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,                                      llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; -  def int_x86_avx512_mask_sqrt_pd_128 : GCCBuiltin<"__builtin_ia32_sqrtpd128_mask">, -        Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, -                                    llvm_i8_ty], [IntrNoMem]>; -  def int_x86_avx512_mask_sqrt_pd_256 : GCCBuiltin<"__builtin_ia32_sqrtpd256_mask">, -        Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, -                                    llvm_i8_ty], [IntrNoMem]>; -  def int_x86_avx512_mask_sqrt_pd_512 : GCCBuiltin<"__builtin_ia32_sqrtpd512_mask">, +  def int_x86_avx512_mask_sqrt_pd_512 :          Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,                                      llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; -  def int_x86_avx512_mask_sqrt_ps_128 : GCCBuiltin<"__builtin_ia32_sqrtps128_mask">, -        Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, -                                     llvm_i8_ty], [IntrNoMem]>; -  def int_x86_avx512_mask_sqrt_ps_256 : GCCBuiltin<"__builtin_ia32_sqrtps256_mask">, -        Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, -                                     llvm_i8_ty], [IntrNoMem]>; -  def int_x86_avx512_mask_sqrt_ps_512 : GCCBuiltin<"__builtin_ia32_sqrtps512_mask">, +  def int_x86_avx512_mask_sqrt_ps_512 :          Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,                                       llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;    def int_x86_avx512_mask_fixupimm_pd_128 : diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index ee8a0c3b12b..7aa2685a14e 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -97,6 +97,15 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {        Name.startswith("avx2.pabs.") || // Added in 6.0        Name.startswith("avx512.mask.pabs.") || // Added in 6.0        Name.startswith("avx512.broadcastm") || // Added in 6.0 +      Name == "sse.sqrt.ss" || // Added in 7.0 +      Name == "sse2.sqrt.sd" || // Added in 7.0 +      Name == "avx512.mask.sqrt.ps.128" || // Added in 7.0 +      Name == "avx512.mask.sqrt.ps.256" || // Added in 7.0 +      Name == "avx512.mask.sqrt.pd.128" || // Added in 7.0 +      Name == "avx512.mask.sqrt.pd.256" || // Added in 7.0 +      Name.startswith("avx.sqrt.p") || // Added in 7.0 +      Name.startswith("sse2.sqrt.p") || // Added in 7.0 +      Name.startswith("sse.sqrt.p") || // Added in 7.0        Name.startswith("avx512.mask.pbroadcast") || // Added in 6.0        Name.startswith("sse2.pcmpeq.") || // Added in 3.1        Name.startswith("sse2.pcmpgt.") || // Added in 3.1 @@ -1475,6 +1484,29 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {                           ExtTy->getPrimitiveSizeInBits();        Rep = Builder.CreateZExt(CI->getArgOperand(0), ExtTy);        Rep = Builder.CreateVectorSplat(NumElts, Rep); +    } else if (IsX86 && (Name == "sse.sqrt.ss" || +                         Name == "sse2.sqrt.sd")) { +      Value *Vec = CI->getArgOperand(0); +      Value *Elt0 = Builder.CreateExtractElement(Vec, (uint64_t)0); +      Function *Intr = Intrinsic::getDeclaration(F->getParent(), +                                                 Intrinsic::sqrt, Elt0->getType()); +      Elt0 = Builder.CreateCall(Intr, Elt0); +      Rep = Builder.CreateInsertElement(Vec, Elt0, (uint64_t)0); +    } else if (IsX86 && (Name.startswith("avx.sqrt.p") || +                         Name.startswith("sse2.sqrt.p") || +                         Name.startswith("sse.sqrt.p"))) { +      Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), +                                                         Intrinsic::sqrt, +                                                         CI->getType()), +                               {CI->getArgOperand(0)}); +    } else if (IsX86 && (Name.startswith("avx512.mask.sqrt.p") && +                         !Name.endswith("512"))) { +        Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), +                                                           Intrinsic::sqrt, +                                                           CI->getType()), +                                 {CI->getArgOperand(0)}); +        Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, +                            CI->getArgOperand(1));      } else if (IsX86 && (Name.startswith("avx512.ptestm") ||                           Name.startswith("avx512.ptestnm"))) {        Value *Op0 = CI->getArgOperand(0); diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index d0ef7cc975f..f921385c328 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -8534,7 +8534,7 @@ multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,  }  multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, -                              X86VectorVTInfo _, string Name, Intrinsic Intr> { +                              X86VectorVTInfo _, string Name> {    let ExeDomain = _.ExeDomain in {      defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),                           (ins _.RC:$src1, _.RC:$src2), OpcodeStr, @@ -8575,30 +8575,20 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWri      def : Pat<(_.EltVT (fsqrt _.FRC:$src)),                (!cast<Instruction>(Name#Zr)                    (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>; - -     def : Pat<(Intr VR128X:$src), -               (!cast<Instruction>(Name#Zr_Int) VR128X:$src, -                                 VR128X:$src)>;    }    let Predicates = [HasAVX512, OptForSize] in {      def : Pat<(_.EltVT (fsqrt (load addr:$src))),                (!cast<Instruction>(Name#Zm)                    (_.EltVT (IMPLICIT_DEF)), addr:$src)>; - -    def : Pat<(Intr _.ScalarIntMemCPat:$src2), -              (!cast<Instruction>(Name#Zm_Int) -                    (_.VT (IMPLICIT_DEF)), addr:$src2)>;    }  }  multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr,                                    X86SchedWriteSizes sched> { -  defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", sched.PS.Scl, f32x_info, NAME#"SS", -                        int_x86_sse_sqrt_ss>, +  defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", sched.PS.Scl, f32x_info, NAME#"SS">,                          EVEX_CD8<32, CD8VT1>, EVEX_4V, XS; -  defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", sched.PD.Scl, f64x_info, NAME#"SD", -                        int_x86_sse2_sqrt_sd>, +  defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", sched.PD.Scl, f64x_info, NAME#"SD">,                          EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W;  } @@ -8711,6 +8701,13 @@ multiclass avx512_masked_scalar<SDNode OpNode, string OpcPrefix, SDNode Move,    }  } +defm : avx512_masked_scalar<fsqrt, "SQRTSSZ", X86Movss, +                            (v1i1 (scalar_to_vector (i8 (trunc (i32 GR32:$mask))))), v4f32x_info, +                            fp32imm0, (COPY_TO_REGCLASS  $mask, VK1WM), HasAVX512>; +defm : avx512_masked_scalar<fsqrt, "SQRTSDZ", X86Movsd, +                            (v1i1 (scalar_to_vector (i8 (trunc (i32 GR32:$mask))))), v2f64x_info, +                            fp64imm0, (COPY_TO_REGCLASS  $mask, VK1WM), HasAVX512>; +  multiclass avx512_masked_scalar_imm<SDNode OpNode, string OpcPrefix, SDNode Move,                                      dag Mask, X86VectorVTInfo _, PatLeaf ZeroFP,                                      bits<8> ImmV, dag OutMask, diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 13f9a0d5eef..746ff7ada36 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -2761,12 +2761,9 @@ defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;  /// For the non-AVX defs, we need $src1 to be tied to $dst because  /// the HW instructions are 2 operand / destructive.  multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, -                          ValueType vt, ValueType ScalarVT, -                          X86MemOperand x86memop, -                          Operand intmemop, ComplexPattern int_cpat, -                          Intrinsic Intr, SDNode OpNode, Domain d, -                          X86FoldableSchedWrite sched, -                          Predicate target> { +                          ValueType ScalarVT, X86MemOperand x86memop, +                          Operand intmemop, SDNode OpNode, Domain d, +                          X86FoldableSchedWrite sched, Predicate target> {    let hasSideEffects = 0 in {    def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),                !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), @@ -2790,6 +2787,11 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,    }    } +} + +multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt, +                              ComplexPattern int_cpat, Intrinsic Intr, +                              Predicate target, string Suffix> {    let Predicates = [target] in {    // These are unary operations, but they are modeled as having 2 source operands    // because the high elements of the destination are unchanged in SSE. @@ -2810,11 +2812,23 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,    }  } +multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, ComplexPattern int_cpat, +                              Intrinsic Intr, Predicate target> { +  let Predicates = [target] in { +   def : Pat<(Intr VR128:$src), +             (!cast<Instruction>(NAME#r_Int) VR128:$src, +                                 VR128:$src)>; +  } +  let Predicates = [target, OptForSize] in { +    def : Pat<(Intr int_cpat:$src2), +              (!cast<Instruction>(NAME#m_Int) +                    (vt (IMPLICIT_DEF)), addr:$src2)>; +  } +} +  multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, -                          ValueType vt, ValueType ScalarVT, -                          X86MemOperand x86memop, -                          Operand intmemop, ComplexPattern int_cpat, -                          Intrinsic Intr, SDNode OpNode, Domain d, +                          ValueType ScalarVT, X86MemOperand x86memop, +                          Operand intmemop, SDNode OpNode, Domain d,                            X86FoldableSchedWrite sched, Predicate target> {    let hasSideEffects = 0 in {    def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), @@ -2849,14 +2863,8 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,    let Predicates = [target] in {     def : Pat<(OpNode RC:$src),  (!cast<Instruction>(NAME#r)                                  (ScalarVT (IMPLICIT_DEF)), RC:$src)>; -   def : Pat<(Intr VR128:$src), -             (!cast<Instruction>(NAME#r_Int) VR128:$src, -                                 VR128:$src)>;    }    let Predicates = [target, OptForSize] in { -    def : Pat<(Intr int_cpat:$src2), -              (!cast<Instruction>(NAME#m_Int) -                    (vt (IMPLICIT_DEF)), addr:$src2)>;      def : Pat<(ScalarVT (OpNode (load addr:$src))),                (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),              addr:$src)>; @@ -2935,29 +2943,32 @@ let Predicates = [HasAVX, NoVLX] in {                  Sched<[sched.XMM.Folded]>;  } +multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode, +                          X86SchedWriteWidths sched, Predicate AVXTarget> { +  defm SS        :  sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32, +                      !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), +                      UseSSE1, "SS">, XS; +  defm V#NAME#SS  : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32, +                      !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), +                      AVXTarget>, +                      XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable; +} +  multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,                            X86SchedWriteWidths sched, Predicate AVXTarget> { -  defm SS        :  sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem, -                      ssmem, sse_load_f32, -                      !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode, -                      SSEPackedSingle, sched.Scl, UseSSE1>, XS; -  defm V#NAME#SS  : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32, -                      f32mem, ssmem, sse_load_f32, -                      !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode, -                      SSEPackedSingle, sched.Scl, AVXTarget>, XS, VEX_4V, -                      VEX_LIG, VEX_WIG; +  defm SS        :  sse_fp_unop_s<opc, OpcodeStr##ss, FR32, f32, f32mem, +                      ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS; +  defm V#NAME#SS  : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, f32, +                      f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>, +                       XS, VEX_4V, VEX_LIG, VEX_WIG;  }  multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,                            X86SchedWriteWidths sched, Predicate AVXTarget> { -  defm SD         : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem, -                         sdmem, sse_load_f64, -                         !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd), -                         OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD; -  defm V#NAME#SD  : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64, -                         f64mem, sdmem, sse_load_f64, -                         !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd), -                         OpNode, SSEPackedDouble, sched.Scl, AVXTarget>, +  defm SD         : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, f64, f64mem, +                         sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD; +  defm V#NAME#SD  : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, f64, +                         f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,                           XD, VEX_4V, VEX_LIG, VEX_WIG;  } @@ -2970,8 +2981,10 @@ defm SQRT  : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt, UseAVX>,  // Reciprocal approximations. Note that these typically require refinement  // in order to obtain suitable precision.  defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>, +             sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,               sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;  defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>, +             sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,               sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;  // There is no f64 version of the reciprocal approximation instructions. @@ -3009,6 +3022,9 @@ multiclass scalar_unary_math_imm_patterns<SDNode OpNode, string OpcPrefix, SDNod    }  } +defm : scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>; +defm : scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>; +  multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,                                             SDNode Move, ValueType VT,                                             Predicate BasePredicate> { @@ -3028,10 +3044,6 @@ defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,                                         v4f32, UseSSE1>;  defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,                                         v4f32, UseSSE1>; -defm : scalar_unary_math_intr_patterns<int_x86_sse_sqrt_ss, "SQRTSS", X86Movss, -                                       v4f32, UseSSE1>; -defm : scalar_unary_math_intr_patterns<int_x86_sse2_sqrt_sd, "SQRTSD", X86Movsd, -                                       v2f64, UseSSE2>;  //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 78185f7e7ee..65954f01b81 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -318,8 +318,6 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {    X86_INTRINSIC_DATA(avx_round_pd_256,  ROUNDP, X86ISD::VRNDSCALE, 0),    X86_INTRINSIC_DATA(avx_round_ps_256,  ROUNDP, X86ISD::VRNDSCALE, 0),    X86_INTRINSIC_DATA(avx_rsqrt_ps_256,  INTR_TYPE_1OP, X86ISD::FRSQRT, 0), -  X86_INTRINSIC_DATA(avx_sqrt_pd_256,   INTR_TYPE_1OP, ISD::FSQRT, 0), -  X86_INTRINSIC_DATA(avx_sqrt_ps_256,   INTR_TYPE_1OP, ISD::FSQRT, 0),    X86_INTRINSIC_DATA(avx_vpermilvar_pd,     INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),    X86_INTRINSIC_DATA(avx_vpermilvar_pd_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),    X86_INTRINSIC_DATA(avx_vpermilvar_ps,     INTR_TYPE_2OP, X86ISD::VPERMILPV, 0), @@ -894,12 +892,8 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {                       X86ISD::SCALEFS, 0),    X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM,                       X86ISD::SCALEFS, 0), -  X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), -  X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),    X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,                       X86ISD::FSQRT_RND), -  X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), -  X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),    X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,                       X86ISD::FSQRT_RND),    X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM, @@ -1289,7 +1283,6 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {    X86_INTRINSIC_DATA(sse_movmsk_ps,     INTR_TYPE_1OP, X86ISD::MOVMSK, 0),    X86_INTRINSIC_DATA(sse_rcp_ps,        INTR_TYPE_1OP, X86ISD::FRCP, 0),    X86_INTRINSIC_DATA(sse_rsqrt_ps,      INTR_TYPE_1OP, X86ISD::FRSQRT, 0), -  X86_INTRINSIC_DATA(sse_sqrt_ps,       INTR_TYPE_1OP, ISD::FSQRT, 0),    X86_INTRINSIC_DATA(sse_ucomieq_ss,    COMI, X86ISD::UCOMI, ISD::SETEQ),    X86_INTRINSIC_DATA(sse_ucomige_ss,    COMI, X86ISD::UCOMI, ISD::SETGE),    X86_INTRINSIC_DATA(sse_ucomigt_ss,    COMI, X86ISD::UCOMI, ISD::SETGT), @@ -1345,7 +1338,6 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {    X86_INTRINSIC_DATA(sse2_psubs_w,      INTR_TYPE_2OP, X86ISD::SUBS, 0),    X86_INTRINSIC_DATA(sse2_psubus_b,     INTR_TYPE_2OP, X86ISD::SUBUS, 0),    X86_INTRINSIC_DATA(sse2_psubus_w,     INTR_TYPE_2OP, X86ISD::SUBUS, 0), -  X86_INTRINSIC_DATA(sse2_sqrt_pd,      INTR_TYPE_1OP, ISD::FSQRT, 0),    X86_INTRINSIC_DATA(sse2_ucomieq_sd,   COMI, X86ISD::UCOMI, ISD::SETEQ),    X86_INTRINSIC_DATA(sse2_ucomige_sd,   COMI, X86ISD::UCOMI, ISD::SETGE),    X86_INTRINSIC_DATA(sse2_ucomigt_sd,   COMI, X86ISD::UCOMI, ISD::SETGT), diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 7640785c101..8190c342c29 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -1293,8 +1293,6 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,      // Unary scalar-as-vector operations that work column-wise.      case Intrinsic::x86_sse_rcp_ss:      case Intrinsic::x86_sse_rsqrt_ss: -    case Intrinsic::x86_sse_sqrt_ss: -    case Intrinsic::x86_sse2_sqrt_sd:        TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,                                          UndefElts, Depth + 1);        if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; } diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll index 0e445ba3a4e..2cb6449f79a 100644 --- a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -3018,10 +3018,12 @@ define <4 x double> @test_mm256_sqrt_pd(<4 x double> %a0) nounwind {  ; X64:       # %bb.0:  ; X64-NEXT:    vsqrtpd %ymm0, %ymm0  ; X64-NEXT:    retq -  %res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) -  ret <4 x double> %res +entry: +  %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a0) #2 +  ret <4 x double> %0  } -declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone + +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) #1  define <8 x float> @test_mm256_sqrt_ps(<8 x float> %a0) nounwind {  ; X32-LABEL: test_mm256_sqrt_ps: @@ -3033,10 +3035,12 @@ define <8 x float> @test_mm256_sqrt_ps(<8 x float> %a0) nounwind {  ; X64:       # %bb.0:  ; X64-NEXT:    vsqrtps %ymm0, %ymm0  ; X64-NEXT:    retq -  %res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) -  ret <8 x float> %res +entry: +  %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0) #2 +  ret <8 x float> %0  } -declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone + +declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #1  define void @test_mm256_store_pd(double* %a0, <4 x double> %a1) nounwind {  ; X32-LABEL: test_mm256_store_pd: diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll index 21612506f85..2c6571b00d0 100644 --- a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll @@ -6,6 +6,36 @@  ; We don't check any vinsertf128 variant with immediate 0 because that's just a blend. +define <4 x double> @test_x86_avx_sqrt_pd_256(<4 x double> %a0) { +; AVX-LABEL: test_x86_avx_sqrt_pd_256: +; AVX:       # %bb.0: +; AVX-NEXT:    vsqrtpd %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x51,0xc0] +; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_sqrt_pd_256: +; AVX512VL:       # %bb.0: +; AVX512VL-NEXT:    vsqrtpd %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x51,0xc0] +; AVX512VL-NEXT:    ret{{[l|q]}} # encoding: [0xc3] +  %res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) ; <<4 x double>> [#uses=1] +  ret <4 x double> %res +} +declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone + +define <8 x float> @test_x86_avx_sqrt_ps_256(<8 x float> %a0) { +; AVX-LABEL: test_x86_avx_sqrt_ps_256: +; AVX:       # %bb.0: +; AVX-NEXT:    vsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x51,0xc0] +; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx_sqrt_ps_256: +; AVX512VL:       # %bb.0: +; AVX512VL-NEXT:    vsqrtps %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x51,0xc0] +; AVX512VL-NEXT:    ret{{[l|q]}} # encoding: [0xc3] +  %res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1] +  ret <8 x float> %res +} +declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone +  define <4 x double> @test_x86_avx_vinsertf128_pd_256_1(<4 x double> %a0, <2 x double> %a1) {  ; AVX-LABEL: test_x86_avx_vinsertf128_pd_256_1:  ; AVX:       # %bb.0: diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll index ace9ae24f27..2fd2b863859 100644 --- a/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -622,39 +622,6 @@ define <8 x float> @test_x86_avx_rsqrt_ps_256(<8 x float> %a0) {  }  declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone - -define <4 x double> @test_x86_avx_sqrt_pd_256(<4 x double> %a0) { -; AVX-LABEL: test_x86_avx_sqrt_pd_256: -; AVX:       # %bb.0: -; AVX-NEXT:    vsqrtpd %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x51,0xc0] -; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx_sqrt_pd_256: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vsqrtpd %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x51,0xc0] -; AVX512VL-NEXT:    ret{{[l|q]}} # encoding: [0xc3] -  %res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) ; <<4 x double>> [#uses=1] -  ret <4 x double> %res -} -declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone - - -define <8 x float> @test_x86_avx_sqrt_ps_256(<8 x float> %a0) { -; AVX-LABEL: test_x86_avx_sqrt_ps_256: -; AVX:       # %bb.0: -; AVX-NEXT:    vsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x51,0xc0] -; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx_sqrt_ps_256: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vsqrtps %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x51,0xc0] -; AVX512VL-NEXT:    ret{{[l|q]}} # encoding: [0xc3] -  %res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1] -  ret <8 x float> %res -} -declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone - -  define <2 x double> @test_x86_avx_vpermilvar_pd(<2 x double> %a0, <2 x i64> %a1) {  ; AVX-LABEL: test_x86_avx_vpermilvar_pd:  ; AVX:       # %bb.0: diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll index b119dbcdc9c..b12493a8918 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll @@ -6375,6 +6375,182 @@ declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #  declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #8  declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) #8 +define <2 x double> @test_mm_mask_sqrt_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A) { +; X32-LABEL: test_mm_mask_sqrt_pd: +; X32:       # %bb.0: # %entry +; X32-NEXT:    movb {{[0-9]+}}(%esp), %al +; X32-NEXT:    kmovw %eax, %k1 +; X32-NEXT:    vsqrtpd %xmm1, %xmm0 {%k1} +; X32-NEXT:    retl +; +; X64-LABEL: test_mm_mask_sqrt_pd: +; X64:       # %bb.0: # %entry +; X64-NEXT:    kmovw %edi, %k1 +; X64-NEXT:    vsqrtpd %xmm1, %xmm0 {%k1} +; X64-NEXT:    retq +entry: +  %0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2 +  %1 = bitcast i8 %__U to <8 x i1> +  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> +  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__W +  ret <2 x double> %2 +} + +declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) + +define <2 x double> @test_mm_maskz_sqrt_pd(i8 zeroext %__U, <2 x double> %__A) { +; X32-LABEL: test_mm_maskz_sqrt_pd: +; X32:       # %bb.0: # %entry +; X32-NEXT:    movb {{[0-9]+}}(%esp), %al +; X32-NEXT:    kmovw %eax, %k1 +; X32-NEXT:    vsqrtpd %xmm0, %xmm0 {%k1} {z} +; X32-NEXT:    retl +; +; X64-LABEL: test_mm_maskz_sqrt_pd: +; X64:       # %bb.0: # %entry +; X64-NEXT:    kmovw %edi, %k1 +; X64-NEXT:    vsqrtpd %xmm0, %xmm0 {%k1} {z} +; X64-NEXT:    retq +entry: +  %0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2 +  %1 = bitcast i8 %__U to <8 x i1> +  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> +  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer +  ret <2 x double> %2 +} + +define <4 x double> @test_mm256_mask_sqrt_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A) { +; X32-LABEL: test_mm256_mask_sqrt_pd: +; X32:       # %bb.0: # %entry +; X32-NEXT:    movb {{[0-9]+}}(%esp), %al +; X32-NEXT:    kmovw %eax, %k1 +; X32-NEXT:    vsqrtpd %ymm1, %ymm0 {%k1} +; X32-NEXT:    retl +; +; X64-LABEL: test_mm256_mask_sqrt_pd: +; X64:       # %bb.0: # %entry +; X64-NEXT:    kmovw %edi, %k1 +; X64-NEXT:    vsqrtpd %ymm1, %ymm0 {%k1} +; X64-NEXT:    retq +entry: +  %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2 +  %1 = bitcast i8 %__U to <8 x i1> +  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__W +  ret <4 x double> %2 +} + +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) + +define <4 x double> @test_mm256_maskz_sqrt_pd(i8 zeroext %__U, <4 x double> %__A) { +; X32-LABEL: test_mm256_maskz_sqrt_pd: +; X32:       # %bb.0: # %entry +; X32-NEXT:    movb {{[0-9]+}}(%esp), %al +; X32-NEXT:    kmovw %eax, %k1 +; X32-NEXT:    vsqrtpd %ymm0, %ymm0 {%k1} {z} +; X32-NEXT:    retl +; +; X64-LABEL: test_mm256_maskz_sqrt_pd: +; X64:       # %bb.0: # %entry +; X64-NEXT:    kmovw %edi, %k1 +; X64-NEXT:    vsqrtpd %ymm0, %ymm0 {%k1} {z} +; X64-NEXT:    retq +entry: +  %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2 +  %1 = bitcast i8 %__U to <8 x i1> +  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer +  ret <4 x double> %2 +} + +define <4 x float> @test_mm_mask_sqrt_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) { +; X32-LABEL: test_mm_mask_sqrt_ps: +; X32:       # %bb.0: # %entry +; X32-NEXT:    movb {{[0-9]+}}(%esp), %al +; X32-NEXT:    kmovw %eax, %k1 +; X32-NEXT:    vsqrtps %xmm1, %xmm0 {%k1} +; X32-NEXT:    retl +; +; X64-LABEL: test_mm_mask_sqrt_ps: +; X64:       # %bb.0: # %entry +; X64-NEXT:    kmovw %edi, %k1 +; X64-NEXT:    vsqrtps %xmm1, %xmm0 {%k1} +; X64-NEXT:    retq +entry: +  %0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2 +  %1 = bitcast i8 %__U to <8 x i1> +  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__W +  ret <4 x float> %2 +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define <4 x float> @test_mm_maskz_sqrt_ps(i8 zeroext %__U, <4 x float> %__A) { +; X32-LABEL: test_mm_maskz_sqrt_ps: +; X32:       # %bb.0: # %entry +; X32-NEXT:    movb {{[0-9]+}}(%esp), %al +; X32-NEXT:    kmovw %eax, %k1 +; X32-NEXT:    vsqrtps %xmm0, %xmm0 {%k1} {z} +; X32-NEXT:    retl +; +; X64-LABEL: test_mm_maskz_sqrt_ps: +; X64:       # %bb.0: # %entry +; X64-NEXT:    kmovw %edi, %k1 +; X64-NEXT:    vsqrtps %xmm0, %xmm0 {%k1} {z} +; X64-NEXT:    retq +entry: +  %0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2 +  %1 = bitcast i8 %__U to <8 x i1> +  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer +  ret <4 x float> %2 +} + +define <8 x float> @test_mm256_mask_sqrt_ps(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A) { +; X32-LABEL: test_mm256_mask_sqrt_ps: +; X32:       # %bb.0: # %entry +; X32-NEXT:    movb {{[0-9]+}}(%esp), %al +; X32-NEXT:    kmovw %eax, %k1 +; X32-NEXT:    vsqrtps %ymm1, %ymm0 {%k1} +; X32-NEXT:    retl +; +; X64-LABEL: test_mm256_mask_sqrt_ps: +; X64:       # %bb.0: # %entry +; X64-NEXT:    kmovw %edi, %k1 +; X64-NEXT:    vsqrtps %ymm1, %ymm0 {%k1} +; X64-NEXT:    retq +entry: +  %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2 +  %1 = bitcast i8 %__U to <8 x i1> +  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__W +  ret <8 x float> %2 +} + +define <8 x float> @test_mm256_maskz_sqrt_ps(i8 zeroext %__U, <8 x float> %__A) { +; X32-LABEL: test_mm256_maskz_sqrt_ps: +; X32:       # %bb.0: # %entry +; X32-NEXT:    movb {{[0-9]+}}(%esp), %al +; X32-NEXT:    kmovw %eax, %k1 +; X32-NEXT:    vsqrtps %ymm0, %ymm0 {%k1} {z} +; X32-NEXT:    retl +; +; X64-LABEL: test_mm256_maskz_sqrt_ps: +; X64:       # %bb.0: # %entry +; X64-NEXT:    kmovw %edi, %k1 +; X64-NEXT:    vsqrtps %ymm0, %ymm0 {%k1} {z} +; X64-NEXT:    retq +entry: +  %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2 +  %1 = bitcast i8 %__U to <8 x i1> +  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer +  ret <8 x float> %2 +} + +declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) + +declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) +declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>)  declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double>, <4 x i32>, i8)  declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>)  declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double>, <4 x float>, i8) diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index de021a369ac..a13148599d1 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -12070,3 +12070,40 @@ define <8 x i32> @test_expand_load_d_256(i8* %addr, <8 x i32> %data) {    %res = call <8 x i32> @llvm.x86.avx512.mask.expand.load.d.256(i8* %addr, <8 x i32> %data, i8 -1)    ret <8 x i32> %res  } + +define <4 x double> @test_sqrt_pd_256(<4 x double> %a0, i8 %mask) { +; X86-LABEL: test_sqrt_pd_256: +; X86:       # %bb.0: +; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT:    vsqrtpd %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0x51,0xc0] +; X86-NEXT:    retl # encoding: [0xc3] +; +; X64-LABEL: test_sqrt_pd_256: +; X64:       # %bb.0: +; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT:    vsqrtpd %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0x51,0xc0] +; X64-NEXT:    retq # encoding: [0xc3] +  %res = call <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double> %a0,  <4 x double> zeroinitializer, i8 %mask) +  ret <4 x double> %res +} +declare <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone + +define <8 x float> @test_sqrt_ps_256(<8 x float> %a0, i8 %mask) { +; X86-LABEL: test_sqrt_ps_256: +; X86:       # %bb.0: +; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT:    vsqrtps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x51,0xc0] +; X86-NEXT:    retl # encoding: [0xc3] +; +; X64-LABEL: test_sqrt_ps_256: +; X64:       # %bb.0: +; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT:    vsqrtps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x51,0xc0] +; X64-NEXT:    retq # encoding: [0xc3] +  %res = call <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask) +  ret <8 x float> %res +} + +declare <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll index 19de714ce8f..6b4b68fb4c1 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -999,43 +999,6 @@ define <4 x float> @test_mm512_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %  }  declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) -define <4 x double> @test_sqrt_pd_256(<4 x double> %a0, i8 %mask) { -; X86-LABEL: test_sqrt_pd_256: -; X86:       # %bb.0: -; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] -; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT:    vsqrtpd %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0x51,0xc0] -; X86-NEXT:    retl # encoding: [0xc3] -; -; X64-LABEL: test_sqrt_pd_256: -; X64:       # %bb.0: -; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT:    vsqrtpd %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0x51,0xc0] -; X64-NEXT:    retq # encoding: [0xc3] -  %res = call <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double> %a0,  <4 x double> zeroinitializer, i8 %mask) -  ret <4 x double> %res -} -declare <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone - -define <8 x float> @test_sqrt_ps_256(<8 x float> %a0, i8 %mask) { -; X86-LABEL: test_sqrt_ps_256: -; X86:       # %bb.0: -; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] -; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT:    vsqrtps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x51,0xc0] -; X86-NEXT:    retl # encoding: [0xc3] -; -; X64-LABEL: test_sqrt_ps_256: -; X64:       # %bb.0: -; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT:    vsqrtps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x51,0xc0] -; X64-NEXT:    retq # encoding: [0xc3] -  %res = call <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask) -  ret <8 x float> %res -} - -declare <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone -  define <4 x double> @test_getexp_pd_256(<4 x double> %a0) {  ; CHECK-LABEL: test_getexp_pd_256:  ; CHECK:       # %bb.0: diff --git a/llvm/test/CodeGen/X86/fold-load-unops.ll b/llvm/test/CodeGen/X86/fold-load-unops.ll index 7feb66525e2..c77c6adf2e8 100644 --- a/llvm/test/CodeGen/X86/fold-load-unops.ll +++ b/llvm/test/CodeGen/X86/fold-load-unops.ll @@ -165,12 +165,14 @@ define float @sqrtss_size(float* %a) optsize{  define <4 x float> @sqrtss_full_size(<4 x float>* %a) optsize{  ; SSE-LABEL: sqrtss_full_size:  ; SSE:       # %bb.0: -; SSE-NEXT:    sqrtss (%rdi), %xmm0 +; SSE-NEXT:    movaps (%rdi), %xmm0 +; SSE-NEXT:    sqrtss %xmm0, %xmm0  ; SSE-NEXT:    retq  ;  ; AVX-LABEL: sqrtss_full_size:  ; AVX:       # %bb.0: -; AVX-NEXT:    vsqrtss (%rdi), %xmm0, %xmm0 +; AVX-NEXT:    vmovaps (%rdi), %xmm0 +; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0  ; AVX-NEXT:    retq      %ld = load <4 x float>, <4 x float>* %a      %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld) @@ -197,12 +199,14 @@ define double @sqrtsd_size(double* %a) optsize {  define <2 x double> @sqrtsd_full_size(<2 x double>* %a) optsize {  ; SSE-LABEL: sqrtsd_full_size:  ; SSE:       # %bb.0: -; SSE-NEXT:    sqrtsd (%rdi), %xmm0 +; SSE-NEXT:    movapd (%rdi), %xmm0 +; SSE-NEXT:    sqrtsd %xmm0, %xmm0  ; SSE-NEXT:    retq  ;  ; AVX-LABEL: sqrtsd_full_size:  ; AVX:       # %bb.0: -; AVX-NEXT:    vsqrtsd (%rdi), %xmm0, %xmm0 +; AVX-NEXT:    vmovapd (%rdi), %xmm0 +; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0  ; AVX-NEXT:    retq      %ld = load <2 x double>, <2 x double>* %a      %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld) diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll index 16dedb74366..a660293d6f7 100644 --- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -2075,10 +2075,10 @@ define <4 x float> @test_mm_sqrt_ps(<4 x float> %a0) {  ; AVX:       # %bb.0:  ; AVX-NEXT:    vsqrtps %xmm0, %xmm0  ; AVX-NEXT:    ret{{[l|q]}} -  %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) +  %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)    ret <4 x float> %res  } -declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) nounwind readnone  define <4 x float> @test_mm_sqrt_ss(<4 x float> %a0) {  ; SSE-LABEL: test_mm_sqrt_ss: @@ -2090,10 +2090,12 @@ define <4 x float> @test_mm_sqrt_ss(<4 x float> %a0) {  ; AVX:       # %bb.0:  ; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0  ; AVX-NEXT:    ret{{[l|q]}} -  %sqrt = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) -  ret <4 x float> %sqrt +  %ext = extractelement <4 x float> %a0, i32 0 +  %sqrt = call float @llvm.sqrt.f32(float %ext) +  %ins = insertelement <4 x float> %a0, float %sqrt, i32 0 +  ret <4 x float> %ins  } -declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone +declare float @llvm.sqrt.f32(float) nounwind readnone  define void @test_mm_store_ps(float *%a0, <4 x float> %a1) {  ; X86-SSE-LABEL: test_mm_store_ps: diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll index 60a455ae148..d153d2f42d6 100644 --- a/llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll @@ -6,6 +6,44 @@  ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1  ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512 + +define <4 x float> @test_x86_sse_sqrt_ps(<4 x float> %a0) { +; SSE-LABEL: test_x86_sse_sqrt_ps: +; SSE:       ## %bb.0: +; SSE-NEXT:    sqrtps %xmm0, %xmm0 ## encoding: [0x0f,0x51,0xc0] +; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse_sqrt_ps: +; AVX1:       ## %bb.0: +; AVX1-NEXT:    vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0] +; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse_sqrt_ps: +; AVX512:       ## %bb.0: +; AVX512-NEXT:    vsqrtps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x51,0xc0] +; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3] +  %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1] +  ret <4 x float> %res +} +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + + +define <4 x float> @test_x86_sse_sqrt_ss(<4 x float> %a0) { +; SSE-LABEL: test_x86_sse_sqrt_ss: +; SSE:       ## %bb.0: +; SSE-NEXT:    sqrtss %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x51,0xc0] +; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX-LABEL: test_x86_sse_sqrt_ss: +; AVX:       ## %bb.0: +; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x51,0xc0] +; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3] +  %res = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1] +  ret <4 x float> %res +} +declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone + +  define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {  ; X86-SSE-LABEL: test_x86_sse_storeu_ps:  ; X86-SSE:       ## %bb.0: diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse-intrinsics-x86.ll index 0014da6b2ec..63ccda1d66b 100644 --- a/llvm/test/CodeGen/X86/sse-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/sse-intrinsics-x86.ll @@ -448,48 +448,6 @@ define <4 x float> @test_x86_sse_rsqrt_ss(<4 x float> %a0) {  declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone -define <4 x float> @test_x86_sse_sqrt_ps(<4 x float> %a0) { -; SSE-LABEL: test_x86_sse_sqrt_ps: -; SSE:       ## %bb.0: -; SSE-NEXT:    sqrtps %xmm0, %xmm0 ## encoding: [0x0f,0x51,0xc0] -; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX1-LABEL: test_x86_sse_sqrt_ps: -; AVX1:       ## %bb.0: -; AVX1-NEXT:    vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0] -; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX512-LABEL: test_x86_sse_sqrt_ps: -; AVX512:       ## %bb.0: -; AVX512-NEXT:    vsqrtps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x51,0xc0] -; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3] -  %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1] -  ret <4 x float> %res -} -declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone - - -define <4 x float> @test_x86_sse_sqrt_ss(<4 x float> %a0) { -; SSE-LABEL: test_x86_sse_sqrt_ss: -; SSE:       ## %bb.0: -; SSE-NEXT:    sqrtss %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x51,0xc0] -; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX1-LABEL: test_x86_sse_sqrt_ss: -; AVX1:       ## %bb.0: -; AVX1-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x51,0xc0] -; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX512-LABEL: test_x86_sse_sqrt_ss: -; AVX512:       ## %bb.0: -; AVX512-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0] -; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3] -  %res = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1] -  ret <4 x float> %res -} -declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone - -  define void @test_x86_sse_stmxcsr(i8* %a0) {  ; X86-SSE-LABEL: test_x86_sse_stmxcsr:  ; X86-SSE:       ## %bb.0: diff --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll index 1a294daf1ea..39bbb8f6538 100644 --- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll +++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll @@ -83,26 +83,22 @@ define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {  define <4 x float> @test_sqrt_ss(<4 x float> %a) {  ; SSE2-LABEL: test_sqrt_ss:  ; SSE2:       # %bb.0: -; SSE2-NEXT:    sqrtss %xmm0, %xmm1 -; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE2-NEXT:    sqrtss %xmm0, %xmm0  ; SSE2-NEXT:    ret{{[l|q]}}  ;  ; SSE41-LABEL: test_sqrt_ss:  ; SSE41:       # %bb.0: -; SSE41-NEXT:    sqrtss %xmm0, %xmm1 -; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE41-NEXT:    sqrtss %xmm0, %xmm0  ; SSE41-NEXT:    ret{{[l|q]}}  ;  ; AVX1-LABEL: test_sqrt_ss:  ; AVX1:       # %bb.0: -; AVX1-NEXT:    vsqrtss %xmm0, %xmm0, %xmm1 -; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX1-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0  ; AVX1-NEXT:    ret{{[l|q]}}  ;  ; AVX512-LABEL: test_sqrt_ss:  ; AVX512:       # %bb.0: -; AVX512-NEXT:    vsqrtss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX512-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0  ; AVX512-NEXT:    ret{{[l|q]}}    %1 = extractelement <4 x float> %a, i32 0    %2 = call float @llvm.sqrt.f32(float %1) @@ -182,26 +178,22 @@ define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {  define <2 x double> @test_sqrt_sd(<2 x double> %a) {  ; SSE2-LABEL: test_sqrt_sd:  ; SSE2:       # %bb.0: -; SSE2-NEXT:    sqrtsd %xmm0, %xmm1 -; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT:    sqrtsd %xmm0, %xmm0  ; SSE2-NEXT:    ret{{[l|q]}}  ;  ; SSE41-LABEL: test_sqrt_sd:  ; SSE41:       # %bb.0: -; SSE41-NEXT:    sqrtsd %xmm0, %xmm1 -; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE41-NEXT:    sqrtsd %xmm0, %xmm0  ; SSE41-NEXT:    ret{{[l|q]}}  ;  ; AVX1-LABEL: test_sqrt_sd:  ; AVX1:       # %bb.0: -; AVX1-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm1 -; AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; AVX1-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0  ; AVX1-NEXT:    ret{{[l|q]}}  ;  ; AVX512-LABEL: test_sqrt_sd:  ; AVX512:       # %bb.0: -; AVX512-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm1 -; AVX512-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; AVX512-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0  ; AVX512-NEXT:    ret{{[l|q]}}    %1 = extractelement <2 x double> %a, i32 0    %2 = call double @llvm.sqrt.f64(double %1) diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll index abc45b02d0d..31046e94c34 100644 --- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -3720,10 +3720,10 @@ define <2 x double> @test_mm_sqrt_pd(<2 x double> %a0) nounwind {  ; AVX:       # %bb.0:  ; AVX-NEXT:    vsqrtpd %xmm0, %xmm0  ; AVX-NEXT:    ret{{[l|q]}} -  %res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) +  %res = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a0)    ret <2 x double> %res  } -declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone +declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) nounwind readnone  define <2 x double> @test_mm_sqrt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {  ; SSE-LABEL: test_mm_sqrt_sd: @@ -3736,14 +3736,12 @@ define <2 x double> @test_mm_sqrt_sd(<2 x double> %a0, <2 x double> %a1) nounwin  ; AVX:       # %bb.0:  ; AVX-NEXT:    vsqrtsd %xmm0, %xmm1, %xmm0  ; AVX-NEXT:    ret{{[l|q]}} -  %call = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0) -  %ext0 = extractelement <2 x double> %call, i32 0 -  %ins0 = insertelement <2 x double> undef, double %ext0, i32 0 -  %ext1 = extractelement <2 x double> %a1, i32 1 -  %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1 -  ret <2 x double> %ins1 +  %ext = extractelement <2 x double> %a0, i32 0 +  %sqrt = call double @llvm.sqrt.f64(double %ext) +  %ins = insertelement <2 x double> %a1, double %sqrt, i32 0 +  ret <2 x double> %ins  } -declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone +declare double @llvm.sqrt.f64(double) nounwind readnone  define <2 x i64> @test_mm_sra_epi16(<2 x i64> %a0, <2 x i64> %a1) {  ; SSE-LABEL: test_mm_sra_epi16: diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll index 91b19fc4fc4..f29b474ea0b 100644 --- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll @@ -6,6 +6,89 @@  ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1  ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512 + +define <2 x double> @test_x86_sse2_sqrt_pd(<2 x double> %a0) { +; SSE-LABEL: test_x86_sse2_sqrt_pd: +; SSE:       ## %bb.0: +; SSE-NEXT:    sqrtpd %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x51,0xc0] +; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse2_sqrt_pd: +; AVX1:       ## %bb.0: +; AVX1-NEXT:    vsqrtpd %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x51,0xc0] +; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse2_sqrt_pd: +; AVX512:       ## %bb.0: +; AVX512-NEXT:    vsqrtpd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x51,0xc0] +; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3] +  %res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) ; <<2 x double>> [#uses=1] +  ret <2 x double> %res +} +declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + + +define <2 x double> @test_x86_sse2_sqrt_sd(<2 x double> %a0) { +; SSE-LABEL: test_x86_sse2_sqrt_sd: +; SSE:       ## %bb.0: +; SSE-NEXT:    sqrtsd %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0x51,0xc0] +; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX-LABEL: test_x86_sse2_sqrt_sd: +; AVX:       ## %bb.0: +; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0] +; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3] +  %res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0) ; <<2 x double>> [#uses=1] +  ret <2 x double> %res +} +declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone + + +define <2 x double> @test_x86_sse2_sqrt_sd_vec_load(<2 x double>* %a0) { +; X86-SSE-LABEL: test_x86_sse2_sqrt_sd_vec_load: +; X86-SSE:       ## %bb.0: +; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-SSE-NEXT:    movapd (%eax), %xmm0 ## encoding: [0x66,0x0f,0x28,0x00] +; X86-SSE-NEXT:    sqrtsd %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0x51,0xc0] +; X86-SSE-NEXT:    retl ## encoding: [0xc3] +; +; X86-AVX1-LABEL: test_x86_sse2_sqrt_sd_vec_load: +; X86-AVX1:       ## %bb.0: +; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX1-NEXT:    vmovapd (%eax), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x00] +; X86-AVX1-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0] +; X86-AVX1-NEXT:    retl ## encoding: [0xc3] +; +; X86-AVX512-LABEL: test_x86_sse2_sqrt_sd_vec_load: +; X86-AVX512:       ## %bb.0: +; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX512-NEXT:    vmovapd (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x00] +; X86-AVX512-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0] +; X86-AVX512-NEXT:    retl ## encoding: [0xc3] +; +; X64-SSE-LABEL: test_x86_sse2_sqrt_sd_vec_load: +; X64-SSE:       ## %bb.0: +; X64-SSE-NEXT:    movapd (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x28,0x07] +; X64-SSE-NEXT:    sqrtsd %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0x51,0xc0] +; X64-SSE-NEXT:    retq ## encoding: [0xc3] +; +; X64-AVX1-LABEL: test_x86_sse2_sqrt_sd_vec_load: +; X64-AVX1:       ## %bb.0: +; X64-AVX1-NEXT:    vmovapd (%rdi), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x07] +; X64-AVX1-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0] +; X64-AVX1-NEXT:    retq ## encoding: [0xc3] +; +; X64-AVX512-LABEL: test_x86_sse2_sqrt_sd_vec_load: +; X64-AVX512:       ## %bb.0: +; X64-AVX512-NEXT:    vmovapd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x07] +; X64-AVX512-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0] +; X64-AVX512-NEXT:    retq ## encoding: [0xc3] +  %a1 = load <2 x double>, <2 x double>* %a0, align 16 +  %res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a1) ; <<2 x double>> [#uses=1] +  ret <2 x double> %res +} + +  define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {  ; SSE-LABEL: test_x86_sse2_psll_dq_bs:  ; SSE:       ## %bb.0: @@ -241,8 +324,8 @@ define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {  ; X86-SSE:       ## %bb.0:  ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]  ; X86-SSE-NEXT:    xorpd %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x57,0xc9] -; X86-SSE-NEXT:    movhpd LCPI8_0, %xmm1 ## encoding: [0x66,0x0f,0x16,0x0d,A,A,A,A] -; X86-SSE-NEXT:    ## fixup A - offset: 4, value: LCPI8_0, kind: FK_Data_4 +; X86-SSE-NEXT:    movhpd LCPI11_0, %xmm1 ## encoding: [0x66,0x0f,0x16,0x0d,A,A,A,A] +; X86-SSE-NEXT:    ## fixup A - offset: 4, value: LCPI11_0, kind: FK_Data_4  ; X86-SSE-NEXT:    ## xmm1 = xmm1[0],mem[0]  ; X86-SSE-NEXT:    addpd %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x58,0xc8]  ; X86-SSE-NEXT:    movupd %xmm1, (%eax) ## encoding: [0x66,0x0f,0x11,0x08] @@ -252,8 +335,8 @@ define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {  ; X86-AVX1:       ## %bb.0:  ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]  ; X86-AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x57,0xc9] -; X86-AVX1-NEXT:    vmovhpd LCPI8_0, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x0d,A,A,A,A] -; X86-AVX1-NEXT:    ## fixup A - offset: 4, value: LCPI8_0, kind: FK_Data_4 +; X86-AVX1-NEXT:    vmovhpd LCPI11_0, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x0d,A,A,A,A] +; X86-AVX1-NEXT:    ## fixup A - offset: 4, value: LCPI11_0, kind: FK_Data_4  ; X86-AVX1-NEXT:    ## xmm1 = xmm1[0],mem[0]  ; X86-AVX1-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc1]  ; X86-AVX1-NEXT:    vmovupd %xmm0, (%eax) ## encoding: [0xc5,0xf9,0x11,0x00] @@ -262,8 +345,8 @@ define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {  ; X86-AVX512-LABEL: test_x86_sse2_storeu_pd:  ; X86-AVX512:       ## %bb.0:  ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT:    vmovsd LCPI8_0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x0d,A,A,A,A] -; X86-AVX512-NEXT:    ## fixup A - offset: 4, value: LCPI8_0, kind: FK_Data_4 +; X86-AVX512-NEXT:    vmovsd LCPI11_0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x0d,A,A,A,A] +; X86-AVX512-NEXT:    ## fixup A - offset: 4, value: LCPI11_0, kind: FK_Data_4  ; X86-AVX512-NEXT:    ## xmm1 = mem[0],zero  ; X86-AVX512-NEXT:    vpslldq $8, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x73,0xf9,0x08]  ; X86-AVX512-NEXT:    ## xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] @@ -275,7 +358,7 @@ define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {  ; X64-SSE:       ## %bb.0:  ; X64-SSE-NEXT:    xorpd %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x57,0xc9]  ; X64-SSE-NEXT:    movhpd {{.*}}(%rip), %xmm1 ## encoding: [0x66,0x0f,0x16,0x0d,A,A,A,A] -; X64-SSE-NEXT:    ## fixup A - offset: 4, value: LCPI8_0-4, kind: reloc_riprel_4byte +; X64-SSE-NEXT:    ## fixup A - offset: 4, value: LCPI11_0-4, kind: reloc_riprel_4byte  ; X64-SSE-NEXT:    ## xmm1 = xmm1[0],mem[0]  ; X64-SSE-NEXT:    addpd %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x58,0xc8]  ; X64-SSE-NEXT:    movupd %xmm1, (%rdi) ## encoding: [0x66,0x0f,0x11,0x0f] @@ -285,7 +368,7 @@ define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {  ; X64-AVX1:       ## %bb.0:  ; X64-AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x57,0xc9]  ; X64-AVX1-NEXT:    vmovhpd {{.*}}(%rip), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x0d,A,A,A,A] -; X64-AVX1-NEXT:    ## fixup A - offset: 4, value: LCPI8_0-4, kind: reloc_riprel_4byte +; X64-AVX1-NEXT:    ## fixup A - offset: 4, value: LCPI11_0-4, kind: reloc_riprel_4byte  ; X64-AVX1-NEXT:    ## xmm1 = xmm1[0],mem[0]  ; X64-AVX1-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x58,0xc1]  ; X64-AVX1-NEXT:    vmovupd %xmm0, (%rdi) ## encoding: [0xc5,0xf9,0x11,0x07] @@ -294,7 +377,7 @@ define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {  ; X64-AVX512-LABEL: test_x86_sse2_storeu_pd:  ; X64-AVX512:       ## %bb.0:  ; X64-AVX512-NEXT:    vmovsd {{.*}}(%rip), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x0d,A,A,A,A] -; X64-AVX512-NEXT:    ## fixup A - offset: 4, value: LCPI8_0-4, kind: reloc_riprel_4byte +; X64-AVX512-NEXT:    ## fixup A - offset: 4, value: LCPI11_0-4, kind: reloc_riprel_4byte  ; X64-AVX512-NEXT:    ## xmm1 = mem[0],zero  ; X64-AVX512-NEXT:    vpslldq $8, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x73,0xf9,0x08]  ; X64-AVX512-NEXT:    ## xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll index 337e3ac4ab6..5e7b351ad13 100644 --- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -1607,93 +1607,6 @@ define <8 x i16> @test_x86_sse2_psubus_w(<8 x i16> %a0, <8 x i16> %a1) {  declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone -define <2 x double> @test_x86_sse2_sqrt_pd(<2 x double> %a0) { -; SSE-LABEL: test_x86_sse2_sqrt_pd: -; SSE:       ## %bb.0: -; SSE-NEXT:    sqrtpd %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x51,0xc0] -; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX1-LABEL: test_x86_sse2_sqrt_pd: -; AVX1:       ## %bb.0: -; AVX1-NEXT:    vsqrtpd %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x51,0xc0] -; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX512-LABEL: test_x86_sse2_sqrt_pd: -; AVX512:       ## %bb.0: -; AVX512-NEXT:    vsqrtpd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x51,0xc0] -; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3] -  %res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) ; <<2 x double>> [#uses=1] -  ret <2 x double> %res -} -declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone - - -define <2 x double> @test_x86_sse2_sqrt_sd(<2 x double> %a0) { -; SSE-LABEL: test_x86_sse2_sqrt_sd: -; SSE:       ## %bb.0: -; SSE-NEXT:    sqrtsd %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0x51,0xc0] -; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX1-LABEL: test_x86_sse2_sqrt_sd: -; AVX1:       ## %bb.0: -; AVX1-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0] -; AVX1-NEXT:    ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX512-LABEL: test_x86_sse2_sqrt_sd: -; AVX512:       ## %bb.0: -; AVX512-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x51,0xc0] -; AVX512-NEXT:    ret{{[l|q]}} ## encoding: [0xc3] -  %res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0) ; <<2 x double>> [#uses=1] -  ret <2 x double> %res -} -declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone - - -define <2 x double> @test_x86_sse2_sqrt_sd_vec_load(<2 x double>* %a0) { -; X86-SSE-LABEL: test_x86_sse2_sqrt_sd_vec_load: -; X86-SSE:       ## %bb.0: -; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-SSE-NEXT:    movapd (%eax), %xmm0 ## encoding: [0x66,0x0f,0x28,0x00] -; X86-SSE-NEXT:    sqrtsd %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0x51,0xc0] -; X86-SSE-NEXT:    retl ## encoding: [0xc3] -; -; X86-AVX1-LABEL: test_x86_sse2_sqrt_sd_vec_load: -; X86-AVX1:       ## %bb.0: -; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT:    vmovapd (%eax), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x00] -; X86-AVX1-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0] -; X86-AVX1-NEXT:    retl ## encoding: [0xc3] -; -; X86-AVX512-LABEL: test_x86_sse2_sqrt_sd_vec_load: -; X86-AVX512:       ## %bb.0: -; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT:    vmovapd (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x00] -; X86-AVX512-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x51,0xc0] -; X86-AVX512-NEXT:    retl ## encoding: [0xc3] -; -; X64-SSE-LABEL: test_x86_sse2_sqrt_sd_vec_load: -; X64-SSE:       ## %bb.0: -; X64-SSE-NEXT:    movapd (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x28,0x07] -; X64-SSE-NEXT:    sqrtsd %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0x51,0xc0] -; X64-SSE-NEXT:    retq ## encoding: [0xc3] -; -; X64-AVX1-LABEL: test_x86_sse2_sqrt_sd_vec_load: -; X64-AVX1:       ## %bb.0: -; X64-AVX1-NEXT:    vmovapd (%rdi), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x07] -; X64-AVX1-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0] -; X64-AVX1-NEXT:    retq ## encoding: [0xc3] -; -; X64-AVX512-LABEL: test_x86_sse2_sqrt_sd_vec_load: -; X64-AVX512:       ## %bb.0: -; X64-AVX512-NEXT:    vmovapd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x07] -; X64-AVX512-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x51,0xc0] -; X64-AVX512-NEXT:    retq ## encoding: [0xc3] -  %a1 = load <2 x double>, <2 x double>* %a0, align 16 -  %res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a1) ; <<2 x double>> [#uses=1] -  ret <2 x double> %res -} - -  define i32 @test_x86_sse2_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) {  ; SSE-LABEL: test_x86_sse2_ucomieq_sd:  ; SSE:       ## %bb.0: diff --git a/llvm/test/CodeGen/X86/sse_partial_update.ll b/llvm/test/CodeGen/X86/sse_partial_update.ll index d6929930844..db575d62380 100644 --- a/llvm/test/CodeGen/X86/sse_partial_update.ll +++ b/llvm/test/CodeGen/X86/sse_partial_update.ll @@ -54,9 +54,10 @@ declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone  define void @sqrtss(<4 x float> %a) nounwind uwtable ssp {  ; CHECK-LABEL: sqrtss:  ; CHECK:       ## %bb.0: ## %entry -; CHECK-NEXT:    sqrtss %xmm0, %xmm0 -; CHECK-NEXT:    cvtss2sd %xmm0, %xmm2 +; CHECK-NEXT:    sqrtss %xmm0, %xmm1 +; CHECK-NEXT:    cvtss2sd %xmm1, %xmm2  ; CHECK-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-NEXT:    xorps %xmm1, %xmm1  ; CHECK-NEXT:    cvtss2sd %xmm0, %xmm1  ; CHECK-NEXT:    movaps %xmm2, %xmm0  ; CHECK-NEXT:    jmp _callee ## TAILCALL @@ -75,9 +76,10 @@ declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone  define void @sqrtsd(<2 x double> %a) nounwind uwtable ssp {  ; CHECK-LABEL: sqrtsd:  ; CHECK:       ## %bb.0: ## %entry -; CHECK-NEXT:    sqrtsd %xmm0, %xmm0 -; CHECK-NEXT:    cvtsd2ss %xmm0, %xmm2 +; CHECK-NEXT:    sqrtsd %xmm0, %xmm1 +; CHECK-NEXT:    cvtsd2ss %xmm1, %xmm2  ; CHECK-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT:    xorps %xmm1, %xmm1  ; CHECK-NEXT:    cvtsd2ss %xmm0, %xmm1  ; CHECK-NEXT:    movaps %xmm2, %xmm0  ; CHECK-NEXT:    jmp _callee2 ## TAILCALL diff --git a/llvm/test/Transforms/InstCombine/X86/x86-sse.ll b/llvm/test/Transforms/InstCombine/X86/x86-sse.ll index 6ed62a4e022..830782b3b20 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-sse.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-sse.ll @@ -33,10 +33,8 @@ define float @test_rcp_ss_1(float %a) {  define float @test_sqrt_ss_0(float %a) {  ; CHECK-LABEL: @test_sqrt_ss_0( -; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float %a, i32 0 -; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> [[TMP1]]) -; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 -; CHECK-NEXT:    ret float [[TMP3]] +; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.sqrt.f32(float %a) +; CHECK-NEXT:    ret float [[TMP1]]  ;    %1 = insertelement <4 x float> undef, float %a, i32 0    %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1 diff --git a/llvm/test/Transforms/InstCombine/X86/x86-sse2.ll b/llvm/test/Transforms/InstCombine/X86/x86-sse2.ll index fe8828bfb5b..721097e016f 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-sse2.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-sse2.ll @@ -4,10 +4,8 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"  define double @test_sqrt_sd_0(double %a) {  ; CHECK-LABEL: @test_sqrt_sd_0( -; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double %a, i32 0 -; CHECK-NEXT:    [[TMP2:%.*]] = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> [[TMP1]]) -; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 -; CHECK-NEXT:    ret double [[TMP3]] +; CHECK-NEXT:    [[TMP1:%.*]] = call double @llvm.sqrt.f64(double %a) +; CHECK-NEXT:    ret double [[TMP1]]  ;    %1 = insertelement <2 x double> undef, double %a, i32 0    %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1  | 

