author    Tomasz Krupa <tomasz.krupa@intel.com>  2018-06-15 18:05:24 +0000
committer Tomasz Krupa <tomasz.krupa@intel.com>  2018-06-15 18:05:24 +0000
commit    bcaab53d479e7005ee69e06321bbb493f9b7f5e6 (patch)
tree      f2fa28deb5a5751a5e98621dd79b7ec20a492d48 /llvm/lib/Target/X86
parent    1657b7b8d2036abb6d18f9aeb31497659be4f761 (diff)
[X86] Lowering sqrt intrinsics to native IR
Summary: Complementary patch to lowering sqrt intrinsics in Clang.

Reviewers: craig.topper, spatel, RKSimon, DavidKreitzer, uriel.k

Reviewed By: craig.topper

Subscribers: tkrupa, mike.dvoretsky, llvm-commits

Differential Revision: https://reviews.llvm.org/D41599

llvm-svn: 334849
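For context, a minimal sketch (not part of this patch; function names are illustrative) of what the companion Clang change means for this backend code: the sqrt builtins now reach instruction selection as generic IR (llvm.sqrt.*) rather than as the x86-specific intrinsics whose special-case entries are deleted from X86IntrinsicsInfo.h below, and the TableGen patterns touched in this diff select that generic IR directly.

/* A minimal sketch, assuming the companion Clang-side lowering. */
#include <xmmintrin.h>

__m128 packed_sqrt(__m128 v) {
  /* Previously special-cased via int_x86_sse_sqrt_ps (entry removed from
   * X86IntrinsicsInfo.h below); with the Clang change it arrives as
   *   call <4 x float> @llvm.sqrt.v4f32(<4 x float> %v)
   * and is selected to sqrtps by the existing fsqrt patterns. */
  return _mm_sqrt_ps(v);
}

__m128 scalar_sqrt(__m128 v) {
  /* The scalar form preserves the upper lanes of v, so it arrives as an
   * extractelement / llvm.sqrt.f32 / insertelement sequence; the
   * scalar_unary_math_patterns instantiated for "SQRTSS"/"SQRTSD" in
   * X86InstrSSE.td fold that back into a single sqrtss. */
  return _mm_sqrt_ss(v);
}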
Diffstat (limited to 'llvm/lib/Target/X86')
-rw-r--r--  llvm/lib/Target/X86/X86InstrAVX512.td    23
-rw-r--r--  llvm/lib/Target/X86/X86InstrSSE.td       86
-rw-r--r--  llvm/lib/Target/X86/X86IntrinsicsInfo.h   8
3 files changed, 59 insertions(+), 58 deletions(-)
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index d0ef7cc975f..f921385c328 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -8534,7 +8534,7 @@ multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
}
multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched,
- X86VectorVTInfo _, string Name, Intrinsic Intr> {
+ X86VectorVTInfo _, string Name> {
let ExeDomain = _.ExeDomain in {
defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
@@ -8575,30 +8575,20 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWri
def : Pat<(_.EltVT (fsqrt _.FRC:$src)),
(!cast<Instruction>(Name#Zr)
(_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>;
-
- def : Pat<(Intr VR128X:$src),
- (!cast<Instruction>(Name#Zr_Int) VR128X:$src,
- VR128X:$src)>;
}
let Predicates = [HasAVX512, OptForSize] in {
def : Pat<(_.EltVT (fsqrt (load addr:$src))),
(!cast<Instruction>(Name#Zm)
(_.EltVT (IMPLICIT_DEF)), addr:$src)>;
-
- def : Pat<(Intr _.ScalarIntMemCPat:$src2),
- (!cast<Instruction>(Name#Zm_Int)
- (_.VT (IMPLICIT_DEF)), addr:$src2)>;
}
}
multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr,
X86SchedWriteSizes sched> {
- defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", sched.PS.Scl, f32x_info, NAME#"SS",
- int_x86_sse_sqrt_ss>,
+ defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", sched.PS.Scl, f32x_info, NAME#"SS">,
EVEX_CD8<32, CD8VT1>, EVEX_4V, XS;
- defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", sched.PD.Scl, f64x_info, NAME#"SD",
- int_x86_sse2_sqrt_sd>,
+ defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", sched.PD.Scl, f64x_info, NAME#"SD">,
EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W;
}
@@ -8711,6 +8701,13 @@ multiclass avx512_masked_scalar<SDNode OpNode, string OpcPrefix, SDNode Move,
}
}
+defm : avx512_masked_scalar<fsqrt, "SQRTSSZ", X86Movss,
+ (v1i1 (scalar_to_vector (i8 (trunc (i32 GR32:$mask))))), v4f32x_info,
+ fp32imm0, (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
+defm : avx512_masked_scalar<fsqrt, "SQRTSDZ", X86Movsd,
+ (v1i1 (scalar_to_vector (i8 (trunc (i32 GR32:$mask))))), v2f64x_info,
+ fp64imm0, (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>;
+
multiclass avx512_masked_scalar_imm<SDNode OpNode, string OpcPrefix, SDNode Move,
dag Mask, X86VectorVTInfo _, PatLeaf ZeroFP,
bits<8> ImmV, dag OutMask,
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 13f9a0d5eef..746ff7ada36 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -2761,12 +2761,9 @@ defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
/// For the non-AVX defs, we need $src1 to be tied to $dst because
/// the HW instructions are 2 operand / destructive.
multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
- ValueType vt, ValueType ScalarVT,
- X86MemOperand x86memop,
- Operand intmemop, ComplexPattern int_cpat,
- Intrinsic Intr, SDNode OpNode, Domain d,
- X86FoldableSchedWrite sched,
- Predicate target> {
+ ValueType ScalarVT, X86MemOperand x86memop,
+ Operand intmemop, SDNode OpNode, Domain d,
+ X86FoldableSchedWrite sched, Predicate target> {
let hasSideEffects = 0 in {
def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
!strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
@@ -2790,6 +2787,11 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
}
}
+}
+
+multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt,
+ ComplexPattern int_cpat, Intrinsic Intr,
+ Predicate target, string Suffix> {
let Predicates = [target] in {
// These are unary operations, but they are modeled as having 2 source operands
// because the high elements of the destination are unchanged in SSE.
@@ -2810,11 +2812,23 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
}
}
+multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, ComplexPattern int_cpat,
+ Intrinsic Intr, Predicate target> {
+ let Predicates = [target] in {
+ def : Pat<(Intr VR128:$src),
+ (!cast<Instruction>(NAME#r_Int) VR128:$src,
+ VR128:$src)>;
+ }
+ let Predicates = [target, OptForSize] in {
+ def : Pat<(Intr int_cpat:$src2),
+ (!cast<Instruction>(NAME#m_Int)
+ (vt (IMPLICIT_DEF)), addr:$src2)>;
+ }
+}
+
multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
- ValueType vt, ValueType ScalarVT,
- X86MemOperand x86memop,
- Operand intmemop, ComplexPattern int_cpat,
- Intrinsic Intr, SDNode OpNode, Domain d,
+ ValueType ScalarVT, X86MemOperand x86memop,
+ Operand intmemop, SDNode OpNode, Domain d,
X86FoldableSchedWrite sched, Predicate target> {
let hasSideEffects = 0 in {
def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
@@ -2849,14 +2863,8 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
let Predicates = [target] in {
def : Pat<(OpNode RC:$src), (!cast<Instruction>(NAME#r)
(ScalarVT (IMPLICIT_DEF)), RC:$src)>;
- def : Pat<(Intr VR128:$src),
- (!cast<Instruction>(NAME#r_Int) VR128:$src,
- VR128:$src)>;
}
let Predicates = [target, OptForSize] in {
- def : Pat<(Intr int_cpat:$src2),
- (!cast<Instruction>(NAME#m_Int)
- (vt (IMPLICIT_DEF)), addr:$src2)>;
def : Pat<(ScalarVT (OpNode (load addr:$src))),
(!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),
addr:$src)>;
@@ -2935,29 +2943,32 @@ let Predicates = [HasAVX, NoVLX] in {
Sched<[sched.XMM.Folded]>;
}
+multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86SchedWriteWidths sched, Predicate AVXTarget> {
+ defm SS : sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
+ !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
+ UseSSE1, "SS">, XS;
+ defm V#NAME#SS : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
+ !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
+ AVXTarget>,
+ XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
+}
+
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86SchedWriteWidths sched, Predicate AVXTarget> {
- defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
- ssmem, sse_load_f32,
- !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
- SSEPackedSingle, sched.Scl, UseSSE1>, XS;
- defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
- f32mem, ssmem, sse_load_f32,
- !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
- SSEPackedSingle, sched.Scl, AVXTarget>, XS, VEX_4V,
- VEX_LIG, VEX_WIG;
+ defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, f32, f32mem,
+ ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
+ defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, f32,
+ f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
+ XS, VEX_4V, VEX_LIG, VEX_WIG;
}
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86SchedWriteWidths sched, Predicate AVXTarget> {
- defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
- sdmem, sse_load_f64,
- !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
- OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
- defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
- f64mem, sdmem, sse_load_f64,
- !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
- OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
+ defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, f64, f64mem,
+ sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
+ defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, f64,
+ f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
XD, VEX_4V, VEX_LIG, VEX_WIG;
}
@@ -2970,8 +2981,10 @@ defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt, UseAVX>,
// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
+ sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;
defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
+ sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;
// There is no f64 version of the reciprocal approximation instructions.
@@ -3009,6 +3022,9 @@ multiclass scalar_unary_math_imm_patterns<SDNode OpNode, string OpcPrefix, SDNod
}
}
+defm : scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
+defm : scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;
+
multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
SDNode Move, ValueType VT,
Predicate BasePredicate> {
@@ -3028,10 +3044,6 @@ defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
v4f32, UseSSE1>;
defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
v4f32, UseSSE1>;
-defm : scalar_unary_math_intr_patterns<int_x86_sse_sqrt_ss, "SQRTSS", X86Movss,
- v4f32, UseSSE1>;
-defm : scalar_unary_math_intr_patterns<int_x86_sse2_sqrt_sd, "SQRTSD", X86Movsd,
- v2f64, UseSSE2>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 78185f7e7ee..65954f01b81 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -318,8 +318,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx_round_pd_256, ROUNDP, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(avx_round_ps_256, ROUNDP, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(avx_rsqrt_ps_256, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
- X86_INTRINSIC_DATA(avx_sqrt_ps_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
X86_INTRINSIC_DATA(avx_vpermilvar_pd, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(avx_vpermilvar_pd_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(avx_vpermilvar_ps, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
@@ -894,12 +892,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::SCALEFS, 0),
X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::SCALEFS, 0),
- X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
- X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,
X86ISD::FSQRT_RND),
- X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
- X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,
X86ISD::FSQRT_RND),
X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM,
@@ -1289,7 +1283,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse_movmsk_ps, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0),
X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(sse_sqrt_ps, INTR_TYPE_1OP, ISD::FSQRT, 0),
X86_INTRINSIC_DATA(sse_ucomieq_ss, COMI, X86ISD::UCOMI, ISD::SETEQ),
X86_INTRINSIC_DATA(sse_ucomige_ss, COMI, X86ISD::UCOMI, ISD::SETGE),
X86_INTRINSIC_DATA(sse_ucomigt_ss, COMI, X86ISD::UCOMI, ISD::SETGT),
@@ -1345,7 +1338,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
X86_INTRINSIC_DATA(sse2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
X86_INTRINSIC_DATA(sse2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
- X86_INTRINSIC_DATA(sse2_sqrt_pd, INTR_TYPE_1OP, ISD::FSQRT, 0),
X86_INTRINSIC_DATA(sse2_ucomieq_sd, COMI, X86ISD::UCOMI, ISD::SETEQ),
X86_INTRINSIC_DATA(sse2_ucomige_sd, COMI, X86ISD::UCOMI, ISD::SETGE),
X86_INTRINSIC_DATA(sse2_ucomigt_sd, COMI, X86ISD::UCOMI, ISD::SETGT),