| author | Tomasz Krupa <tomasz.krupa@intel.com> | 2018-06-15 18:05:24 +0000 |
|---|---|---|
| committer | Tomasz Krupa <tomasz.krupa@intel.com> | 2018-06-15 18:05:24 +0000 |
| commit | bcaab53d479e7005ee69e06321bbb493f9b7f5e6 (patch) | |
| tree | f2fa28deb5a5751a5e98621dd79b7ec20a492d48 /llvm/lib | |
| parent | 1657b7b8d2036abb6d18f9aeb31497659be4f761 (diff) | |
[X86] Lowering sqrt intrinsics to native IR
Summary: Complementary patch to the lowering of sqrt intrinsics in Clang.
Reviewers: craig.topper, spatel, RKSimon, DavidKreitzer, uriel.k
Reviewed By: craig.topper
Subscribers: tkrupa, mike.dvoretsky, llvm-commits
Differential Revision: https://reviews.llvm.org/D41599
llvm-svn: 334849
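
To make the upgrade concrete: for the scalar intrinsics, AutoUpgrade extracts element 0, applies the generic `llvm.sqrt`, and reinserts the result, mirroring the `CreateExtractElement`/`CreateCall`/`CreateInsertElement` sequence in the diff below. A minimal IR sketch (the function name `@upgraded_sqrt_ss` and value names are illustrative, not from the patch):

```llvm
declare float @llvm.sqrt.f32(float)

; Equivalent of the pre-upgrade call
;   %r = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %v)
define <4 x float> @upgraded_sqrt_ss(<4 x float> %v) {
  %e = extractelement <4 x float> %v, i64 0   ; lane 0 only
  %s = call float @llvm.sqrt.f32(float %e)    ; generic sqrt
  %r = insertelement <4 x float> %v, float %s, i64 0
  ret <4 x float> %r                          ; lanes 1-3 pass through
}
```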
Diffstat (limited to 'llvm/lib')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | llvm/lib/IR/AutoUpgrade.cpp | 32 |
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrAVX512.td | 23 |
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrSSE.td | 86 |
| -rw-r--r-- | llvm/lib/Target/X86/X86IntrinsicsInfo.h | 8 |
| -rw-r--r-- | llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp | 2 |
5 files changed, 91 insertions, 60 deletions
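
The packed variants upgrade more directly still: the old call is replaced one-for-one by the generic `llvm.sqrt` of the matching vector type, which is why `X86IntrinsicsInfo.h` no longer needs entries for them. A sketch of the `sse2.sqrt.pd` case (function name illustrative):

```llvm
declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)

; Equivalent of the pre-upgrade call
;   %r = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %v)
define <2 x double> @upgraded_sqrt_pd(<2 x double> %v) {
  %r = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %v)
  ret <2 x double> %r
}
```

The full diff follows.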
```diff
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index ee8a0c3b12b..7aa2685a14e 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -97,6 +97,15 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
       Name.startswith("avx2.pabs.") || // Added in 6.0
       Name.startswith("avx512.mask.pabs.") || // Added in 6.0
       Name.startswith("avx512.broadcastm") || // Added in 6.0
+      Name == "sse.sqrt.ss" || // Added in 7.0
+      Name == "sse2.sqrt.sd" || // Added in 7.0
+      Name == "avx512.mask.sqrt.ps.128" || // Added in 7.0
+      Name == "avx512.mask.sqrt.ps.256" || // Added in 7.0
+      Name == "avx512.mask.sqrt.pd.128" || // Added in 7.0
+      Name == "avx512.mask.sqrt.pd.256" || // Added in 7.0
+      Name.startswith("avx.sqrt.p") || // Added in 7.0
+      Name.startswith("sse2.sqrt.p") || // Added in 7.0
+      Name.startswith("sse.sqrt.p") || // Added in 7.0
       Name.startswith("avx512.mask.pbroadcast") || // Added in 6.0
       Name.startswith("sse2.pcmpeq.") || // Added in 3.1
       Name.startswith("sse2.pcmpgt.") || // Added in 3.1
@@ -1475,6 +1484,29 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
                         ExtTy->getPrimitiveSizeInBits();
       Rep = Builder.CreateZExt(CI->getArgOperand(0), ExtTy);
       Rep = Builder.CreateVectorSplat(NumElts, Rep);
+    } else if (IsX86 && (Name == "sse.sqrt.ss" ||
+                         Name == "sse2.sqrt.sd")) {
+      Value *Vec = CI->getArgOperand(0);
+      Value *Elt0 = Builder.CreateExtractElement(Vec, (uint64_t)0);
+      Function *Intr = Intrinsic::getDeclaration(F->getParent(),
+                                                 Intrinsic::sqrt, Elt0->getType());
+      Elt0 = Builder.CreateCall(Intr, Elt0);
+      Rep = Builder.CreateInsertElement(Vec, Elt0, (uint64_t)0);
+    } else if (IsX86 && (Name.startswith("avx.sqrt.p") ||
+                         Name.startswith("sse2.sqrt.p") ||
+                         Name.startswith("sse.sqrt.p"))) {
+      Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(),
+                                                         Intrinsic::sqrt,
+                                                         CI->getType()),
+                               {CI->getArgOperand(0)});
+    } else if (IsX86 && (Name.startswith("avx512.mask.sqrt.p") &&
+                         !Name.endswith("512"))) {
+      Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(),
+                                                         Intrinsic::sqrt,
+                                                         CI->getType()),
+                               {CI->getArgOperand(0)});
+      Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
+                          CI->getArgOperand(1));
     } else if (IsX86 && (Name.startswith("avx512.ptestm") ||
                          Name.startswith("avx512.ptestnm"))) {
       Value *Op0 = CI->getArgOperand(0);
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index d0ef7cc975f..f921385c328 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -8534,7 +8534,7 @@ multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
 }
 
 multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched,
-                              X86VectorVTInfo _, string Name, Intrinsic Intr> {
+                              X86VectorVTInfo _, string Name> {
   let ExeDomain = _.ExeDomain in {
     defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                          (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
@@ -8575,30 +8575,20 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWri
     def : Pat<(_.EltVT (fsqrt _.FRC:$src)),
               (!cast<Instruction>(Name#Zr)
                   (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>;
-
-    def : Pat<(Intr VR128X:$src),
-              (!cast<Instruction>(Name#Zr_Int) VR128X:$src,
-                                               VR128X:$src)>;
   }
 
   let Predicates = [HasAVX512, OptForSize] in {
     def : Pat<(_.EltVT (fsqrt (load addr:$src))),
               (!cast<Instruction>(Name#Zm)
                   (_.EltVT (IMPLICIT_DEF)), addr:$src)>;
-
-    def : Pat<(Intr _.ScalarIntMemCPat:$src2),
-              (!cast<Instruction>(Name#Zm_Int)
-                  (_.VT (IMPLICIT_DEF)), addr:$src2)>;
   }
 }
 
 multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr,
                                   X86SchedWriteSizes sched> {
-  defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", sched.PS.Scl, f32x_info, NAME#"SS",
-                                int_x86_sse_sqrt_ss>,
+  defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", sched.PS.Scl, f32x_info, NAME#"SS">,
                  EVEX_CD8<32, CD8VT1>, EVEX_4V, XS;
-  defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", sched.PD.Scl, f64x_info, NAME#"SD",
-                                int_x86_sse2_sqrt_sd>,
+  defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", sched.PD.Scl, f64x_info, NAME#"SD">,
                  EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W;
 }
 
@@ -8711,6 +8701,13 @@ multiclass avx512_masked_scalar<SDNode OpNode, string OpcPrefix, SDNode Move,
   }
 }
 
+defm : avx512_masked_scalar<fsqrt, "SQRTSSZ", X86Movss,
+                            (v1i1 (scalar_to_vector (i8 (trunc (i32 GR32:$mask))))), v4f32x_info,
+                            fp32imm0, (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
+defm : avx512_masked_scalar<fsqrt, "SQRTSDZ", X86Movsd,
+                            (v1i1 (scalar_to_vector (i8 (trunc (i32 GR32:$mask))))), v2f64x_info,
+                            fp64imm0, (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
+
 multiclass avx512_masked_scalar_imm<SDNode OpNode, string OpcPrefix, SDNode Move,
                                     dag Mask, X86VectorVTInfo _, PatLeaf ZeroFP,
                                     bits<8> ImmV, dag OutMask,
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 13f9a0d5eef..746ff7ada36 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -2761,12 +2761,9 @@ defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
 /// For the non-AVX defs, we need $src1 to be tied to $dst because
 /// the HW instructions are 2 operand / destructive.
 multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
-                         ValueType vt, ValueType ScalarVT,
-                         X86MemOperand x86memop,
-                         Operand intmemop, ComplexPattern int_cpat,
-                         Intrinsic Intr, SDNode OpNode, Domain d,
-                         X86FoldableSchedWrite sched,
-                         Predicate target> {
+                         ValueType ScalarVT, X86MemOperand x86memop,
+                         Operand intmemop, SDNode OpNode, Domain d,
+                         X86FoldableSchedWrite sched, Predicate target> {
   let hasSideEffects = 0 in {
   def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
             !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
@@ -2790,6 +2787,11 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
   }
 }
 
+}
+
+multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt,
+                              ComplexPattern int_cpat, Intrinsic Intr,
+                              Predicate target, string Suffix> {
   let Predicates = [target] in {
   // These are unary operations, but they are modeled as having 2 source operands
   // because the high elements of the destination are unchanged in SSE.
@@ -2810,11 +2812,23 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
   }
 }
 
+multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, ComplexPattern int_cpat,
+                              Intrinsic Intr, Predicate target> {
+  let Predicates = [target] in {
+   def : Pat<(Intr VR128:$src),
+             (!cast<Instruction>(NAME#r_Int) VR128:$src,
+                                 VR128:$src)>;
+  }
+  let Predicates = [target, OptForSize] in {
+    def : Pat<(Intr int_cpat:$src2),
+              (!cast<Instruction>(NAME#m_Int)
+                    (vt (IMPLICIT_DEF)), addr:$src2)>;
+  }
+}
+
 multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
-                         ValueType vt, ValueType ScalarVT,
-                         X86MemOperand x86memop,
-                         Operand intmemop, ComplexPattern int_cpat,
-                         Intrinsic Intr, SDNode OpNode, Domain d,
+                         ValueType ScalarVT, X86MemOperand x86memop,
+                         Operand intmemop, SDNode OpNode, Domain d,
                          X86FoldableSchedWrite sched, Predicate target> {
   let hasSideEffects = 0 in {
   def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
@@ -2849,14 +2863,8 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
   let Predicates = [target] in {
    def : Pat<(OpNode RC:$src),  (!cast<Instruction>(NAME#r)
                                 (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
-   def : Pat<(Intr VR128:$src),
-             (!cast<Instruction>(NAME#r_Int) VR128:$src,
-                                 VR128:$src)>;
  }
  let Predicates = [target, OptForSize] in {
-  def : Pat<(Intr int_cpat:$src2),
-            (!cast<Instruction>(NAME#m_Int)
-                  (vt (IMPLICIT_DEF)), addr:$src2)>;
   def : Pat<(ScalarVT (OpNode (load addr:$src))),
             (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),
              addr:$src)>;
@@ -2935,29 +2943,32 @@ let Predicates = [HasAVX, NoVLX] in {
                               Sched<[sched.XMM.Folded]>;
 }
 
+multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                               X86SchedWriteWidths sched, Predicate AVXTarget> {
+  defm SS        :  sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
+                      !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
+                      UseSSE1, "SS">, XS;
+  defm V#NAME#SS  : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
+                      !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
+                      AVXTarget>,
+                      XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
+}
+
 multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           X86SchedWriteWidths sched, Predicate AVXTarget> {
-  defm SS        :  sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
-                      ssmem, sse_load_f32,
-                      !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
-                      SSEPackedSingle, sched.Scl, UseSSE1>, XS;
-  defm V#NAME#SS  : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
-                      f32mem, ssmem, sse_load_f32,
-                      !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
-                      SSEPackedSingle, sched.Scl, AVXTarget>, XS, VEX_4V,
-                      VEX_LIG, VEX_WIG;
+  defm SS        :  sse_fp_unop_s<opc, OpcodeStr##ss, FR32, f32, f32mem,
+                      ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
+  defm V#NAME#SS  : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, f32,
+                      f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
+                      XS, VEX_4V, VEX_LIG, VEX_WIG;
 }
 
 multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           X86SchedWriteWidths sched, Predicate AVXTarget> {
-  defm SD         : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
-                          sdmem, sse_load_f64,
-                          !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
-                          OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
-  defm V#NAME#SD  : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
-                          f64mem, sdmem, sse_load_f64,
-                          !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
-                          OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
+  defm SD         : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, f64, f64mem,
+                          sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
+  defm V#NAME#SD  : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, f64,
+                          f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
                           XD, VEX_4V, VEX_LIG, VEX_WIG;
 }
 
@@ -2970,8 +2981,10 @@ defm SQRT  : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt, UseAVX>,
 // Reciprocal approximations. Note that these typically require refinement
 // in order to obtain suitable precision.
 defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
+             sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
              sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;
 defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
+             sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
              sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;
 
 // There is no f64 version of the reciprocal approximation instructions.
@@ -3009,6 +3022,9 @@ multiclass scalar_unary_math_imm_patterns<SDNode OpNode, string OpcPrefix, SDNod
   }
 }
 
+defm : scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
+defm : scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;
+
 multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
                                            SDNode Move, ValueType VT,
                                            Predicate BasePredicate> {
@@ -3028,10 +3044,6 @@ defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
                                        v4f32, UseSSE1>;
 defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
                                        v4f32, UseSSE1>;
-defm : scalar_unary_math_intr_patterns<int_x86_sse_sqrt_ss, "SQRTSS", X86Movss,
-                                       v4f32, UseSSE1>;
-defm : scalar_unary_math_intr_patterns<int_x86_sse2_sqrt_sd, "SQRTSD", X86Movsd,
-                                       v2f64, UseSSE2>;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 78185f7e7ee..65954f01b81 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -318,8 +318,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx_round_pd_256, ROUNDP, X86ISD::VRNDSCALE, 0),
   X86_INTRINSIC_DATA(avx_round_ps_256, ROUNDP, X86ISD::VRNDSCALE, 0),
   X86_INTRINSIC_DATA(avx_rsqrt_ps_256, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
-  X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
-  X86_INTRINSIC_DATA(avx_sqrt_ps_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
   X86_INTRINSIC_DATA(avx_vpermilvar_pd, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
   X86_INTRINSIC_DATA(avx_vpermilvar_pd_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
   X86_INTRINSIC_DATA(avx_vpermilvar_ps, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
@@ -894,12 +892,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
                      X86ISD::SCALEFS, 0),
   X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM,
                      X86ISD::SCALEFS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
-  X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
   X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,
                      X86ISD::FSQRT_RND),
-  X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
-  X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
   X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,
                      X86ISD::FSQRT_RND),
   X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM,
@@ -1289,7 +1283,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(sse_movmsk_ps, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
   X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0),
   X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
-  X86_INTRINSIC_DATA(sse_sqrt_ps, INTR_TYPE_1OP, ISD::FSQRT, 0),
   X86_INTRINSIC_DATA(sse_ucomieq_ss, COMI, X86ISD::UCOMI, ISD::SETEQ),
   X86_INTRINSIC_DATA(sse_ucomige_ss, COMI, X86ISD::UCOMI, ISD::SETGE),
   X86_INTRINSIC_DATA(sse_ucomigt_ss, COMI, X86ISD::UCOMI, ISD::SETGT),
@@ -1345,7 +1338,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(sse2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
   X86_INTRINSIC_DATA(sse2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
   X86_INTRINSIC_DATA(sse2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
-  X86_INTRINSIC_DATA(sse2_sqrt_pd, INTR_TYPE_1OP, ISD::FSQRT, 0),
   X86_INTRINSIC_DATA(sse2_ucomieq_sd, COMI, X86ISD::UCOMI, ISD::SETEQ),
   X86_INTRINSIC_DATA(sse2_ucomige_sd, COMI, X86ISD::UCOMI, ISD::SETGE),
   X86_INTRINSIC_DATA(sse2_ucomigt_sd, COMI, X86ISD::UCOMI, ISD::SETGT),
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 7640785c101..8190c342c29 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1293,8 +1293,6 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
   // Unary scalar-as-vector operations that work column-wise.
   case Intrinsic::x86_sse_rcp_ss:
   case Intrinsic::x86_sse_rsqrt_ss:
-  case Intrinsic::x86_sse_sqrt_ss:
-  case Intrinsic::x86_sse2_sqrt_sd:
     TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
                                       UndefElts, Depth + 1);
     if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
```
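
For the 128- and 256-bit masked AVX-512 variants, the upgrade emits the generic sqrt and then merges with the passthrough operand via `EmitX86Select`. A sketch of the `avx512.mask.sqrt.ps.128` case — the bitcast/shufflevector mask expansion is an assumption about what `EmitX86Select` produces for fewer than eight lanes, and the function name is illustrative:

```llvm
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)

; Equivalent of the pre-upgrade call
;   %r = call <4 x float> @llvm.x86.avx512.mask.sqrt.ps.128(
;            <4 x float> %v, <4 x float> %passthru, i8 %mask)
define <4 x float> @upgraded_masked_sqrt_ps_128(<4 x float> %v,
                                                <4 x float> %passthru,
                                                i8 %mask) {
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %v)
  ; Assumed mask expansion: i8 -> <8 x i1>, keep the low 4 lanes.
  %b = bitcast i8 %mask to <8 x i1>
  %m = shufflevector <8 x i1> %b, <8 x i1> %b,
                     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %r = select <4 x i1> %m, <4 x float> %s, <4 x float> %passthru
  ret <4 x float> %r
}
```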

