diff options
| author | Evandro Menezes <e.menezes@samsung.com> | 2016-10-24 16:14:58 +0000 |
|---|---|---|
| committer | Evandro Menezes <e.menezes@samsung.com> | 2016-10-24 16:14:58 +0000 |
| commit | eff2bd9d4f3633e8a8ad1dc778ab6159a4089b56 (patch) | |
| tree | 5851d4e00fe57776a9cae141557b036f59202a16 /llvm/lib/Target/AArch64 | |
| parent | 566eb864b670a960de4375432657aa24a9e88316 (diff) | |
| download | bcm5719-llvm-eff2bd9d4f3633e8a8ad1dc778ab6159a4089b56.tar.gz bcm5719-llvm-eff2bd9d4f3633e8a8ad1dc778ab6159a4089b56.zip | |
[AArch64] Optionally use the Newton series for reciprocal estimation
Add support for estimating the square root or its reciprocal and division or
reciprocal using the combiner generic Newton series.
Differential revision: https://reviews.llvm.org/D25291
llvm-svn: 284986
Diffstat (limited to 'llvm/lib/Target/AArch64')
| -rw-r--r-- | llvm/lib/Target/AArch64/AArch64.td | 5 | ||||
| -rw-r--r-- | llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 50 | ||||
| -rw-r--r-- | llvm/lib/Target/AArch64/AArch64ISelLowering.h | 8 | ||||
| -rw-r--r-- | llvm/lib/Target/AArch64/AArch64InstrInfo.td | 29 | ||||
| -rw-r--r-- | llvm/lib/Target/AArch64/AArch64Subtarget.h | 2 |
5 files changed, 94 insertions, 0 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 2ff3cf45a84..4857a0433db 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -106,6 +106,10 @@ def FeatureDisableLatencySchedHeuristic : SubtargetFeature< "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true", "Disable latency scheduling heuristic">; +def FeatureUseRSqrt : SubtargetFeature< + "use-reciprocal-square-root", "UseRSqrt", "true", + "Use the reciprocal square root approximation">; + //===----------------------------------------------------------------------===// // Architectures. // @@ -227,6 +231,7 @@ def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1", FeatureNEON, FeaturePerfMon, FeaturePostRAScheduler, + FeatureUseRSqrt, FeatureZCZeroing ]>; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 8ef6d555b74..d9e61aa936b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -959,6 +959,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost"; case AArch64ISD::SMULL: return "AArch64ISD::SMULL"; case AArch64ISD::UMULL: return "AArch64ISD::UMULL"; + case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE"; + case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE"; } return nullptr; } @@ -4608,6 +4610,54 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { // AArch64 Optimization Hooks //===----------------------------------------------------------------------===// +static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, + SDValue Operand, SelectionDAG &DAG, + int &ExtraSteps) { + EVT VT = Operand.getValueType(); + if (ST->hasNEON() && + (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 || + VT == MVT::f32 || VT == MVT::v1f32 || + VT == MVT::v2f32 || VT == MVT::v4f32)) { + if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) + // For the reciprocal estimates, convergence is quadratic, so the number + // of digits is doubled after each iteration. In ARMv8, the accuracy of + // the initial estimate is 2^-8. Thus the number of extra steps to refine + // the result for float (23 mantissa bits) is 2 and for double (52 + // mantissa bits) is 3. + ExtraSteps = VT == MVT::f64 ? 3 : 2; + + return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); + } + + return SDValue(); +} + +SDValue AArch64TargetLowering::getRsqrtEstimate(SDValue Operand, + SelectionDAG &DAG, int Enabled, + int &ExtraSteps, + bool &UseOneConst) const { + if (Enabled == ReciprocalEstimate::Enabled || + (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt())) + if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand, + DAG, ExtraSteps)) { + UseOneConst = true; + return Estimate; + } + + return SDValue(); +} + +SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, + SelectionDAG &DAG, int Enabled, + int &ExtraSteps) const { + if (Enabled == ReciprocalEstimate::Enabled) + if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand, + DAG, ExtraSteps)) + return Estimate; + + return SDValue(); +} + //===----------------------------------------------------------------------===// // AArch64 Inline Assembly Support //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 28d158cc66c..c9ae7cd94bc 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -187,6 +187,10 @@ enum NodeType : unsigned { SMULL, UMULL, + // Reciprocal estimates. + FRECPE, + FRSQRTE, + // NEON Load/Store with post-increment base updates LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, LD3post, @@ -530,6 +534,10 @@ private: SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const override; + SDValue getRsqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, + int &ExtraSteps, bool &UseOneConst) const override; + SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, + int &ExtraSteps) const override; unsigned combineRepeatedFPDivisors() const override; ConstraintType getConstraintType(StringRef Constraint) const override; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index dad097e07ac..e8386a65325 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -286,6 +286,9 @@ def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>; def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>; +def AArch64frecpe : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>; +def AArch64frsqrte : SDNode<"AArch64ISD::FRSQRTE", SDTFPUnaryOp>; + def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>; def AArch64uaddv : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>; def AArch64sminv : SDNode<"AArch64ISD::SMINV", SDT_AArch64UnaryVec>; @@ -3406,6 +3409,19 @@ def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))), def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 FPR64:$Rn))), (FRECPEv1i64 FPR64:$Rn)>; +def : Pat<(f32 (AArch64frecpe (f32 FPR32:$Rn))), + (FRECPEv1i32 FPR32:$Rn)>; +def : Pat<(v2f32 (AArch64frecpe (v2f32 V64:$Rn))), + (FRECPEv2f32 V64:$Rn)>; +def : Pat<(v4f32 (AArch64frecpe (v4f32 FPR128:$Rn))), + (FRECPEv4f32 FPR128:$Rn)>; +def : Pat<(f64 (AArch64frecpe (f64 FPR64:$Rn))), + (FRECPEv1i64 FPR64:$Rn)>; +def : Pat<(v1f64 (AArch64frecpe (v1f64 FPR64:$Rn))), + (FRECPEv1i64 FPR64:$Rn)>; +def : Pat<(v2f64 (AArch64frecpe (v2f64 FPR128:$Rn))), + (FRECPEv2f64 FPR128:$Rn)>; + def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))), (FRECPXv1i32 FPR32:$Rn)>; def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))), @@ -3418,6 +3434,19 @@ def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))), def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))), (FRSQRTEv1i64 FPR64:$Rn)>; +def : Pat<(f32 (AArch64frsqrte (f32 FPR32:$Rn))), + (FRSQRTEv1i32 FPR32:$Rn)>; +def : Pat<(v2f32 (AArch64frsqrte (v2f32 V64:$Rn))), + (FRSQRTEv2f32 V64:$Rn)>; +def : Pat<(v4f32 (AArch64frsqrte (v4f32 FPR128:$Rn))), + (FRSQRTEv4f32 FPR128:$Rn)>; +def : Pat<(f64 (AArch64frsqrte (f64 FPR64:$Rn))), + (FRSQRTEv1i64 FPR64:$Rn)>; +def : Pat<(v1f64 (AArch64frsqrte (v1f64 FPR64:$Rn))), + (FRSQRTEv1i64 FPR64:$Rn)>; +def : Pat<(v2f64 (AArch64frsqrte (v2f64 FPR128:$Rn))), + (FRSQRTEv2f64 FPR128:$Rn)>; + // If an integer is about to be converted to a floating point value, // just load it on the floating point unit. // Here are the patterns for 8 and 16-bits to float. diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index cdc5d741ae3..5612c1323bc 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -83,6 +83,7 @@ protected: bool HasArithmeticBccFusion = false; bool HasArithmeticCbzFusion = false; bool DisableLatencySchedHeuristic = false; + bool UseRSqrt = false; uint8_t MaxInterleaveFactor = 2; uint8_t VectorInsertExtractBaseCost = 3; uint16_t CacheLineSize = 0; @@ -191,6 +192,7 @@ public: } bool hasArithmeticBccFusion() const { return HasArithmeticBccFusion; } bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; } + bool useRSqrt() const { return UseRSqrt; } unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; } unsigned getVectorInsertExtractBaseCost() const { return VectorInsertExtractBaseCost; |

