 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp  |  36
 llvm/lib/Target/AArch64/AArch64ISelLowering.h    |   9
 llvm/lib/Target/AArch64/AArch64InstrInfo.td      |  29
 llvm/lib/Target/AArch64/AArch64TargetMachine.cpp |  29
 llvm/lib/Target/AArch64/AArch64TargetMachine.h   |   2
 llvm/test/CodeGen/AArch64/recp-fastmath.ll       |  79
 llvm/test/CodeGen/AArch64/sqrt-fastmath.ll       | 158
 7 files changed, 339 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 10a2b9e33ee..575f9d9fa5b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -970,6 +970,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case AArch64ISD::ST4LANEpost:       return "AArch64ISD::ST4LANEpost";
   case AArch64ISD::SMULL:             return "AArch64ISD::SMULL";
   case AArch64ISD::UMULL:             return "AArch64ISD::UMULL";
+  case AArch64ISD::FRSQRTE:           return "AArch64ISD::FRSQRTE";
+  case AArch64ISD::FRECPE:            return "AArch64ISD::FRECPE";
   }
   return nullptr;
 }
@@ -4624,6 +4626,40 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
 //===----------------------------------------------------------------------===//
 //                          AArch64 Optimization Hooks
 //===----------------------------------------------------------------------===//
+/// getEstimate - Return the appropriate estimate DAG for either the reciprocal
+/// or the reciprocal square root.
+static SDValue getEstimate(const AArch64Subtarget &ST,
+  const AArch64TargetLowering::DAGCombinerInfo &DCI, unsigned Opcode,
+  const SDValue &Operand, unsigned &ExtraSteps) {
+  if (!ST.hasNEON())
+    return SDValue();
+
+  EVT VT = Operand.getValueType();
+
+  std::string RecipOp;
+  RecipOp = Opcode == (AArch64ISD::FRECPE) ? "div": "sqrt";
+  RecipOp = ((VT.isVector()) ? "vec-": "") + RecipOp;
+  RecipOp += (VT.getScalarType() == MVT::f64) ? "d": "f";
+
+  TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
+  if (!Recips.isEnabled(RecipOp))
+    return SDValue();
+
+  ExtraSteps = Recips.getRefinementSteps(RecipOp);
+  return DCI.DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
+}
+
+SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
+  DAGCombinerInfo &DCI, unsigned &ExtraSteps) const {
+  return getEstimate(*Subtarget, DCI, AArch64ISD::FRECPE, Operand, ExtraSteps);
+}
+
+SDValue AArch64TargetLowering::getRsqrtEstimate(SDValue Operand,
+  DAGCombinerInfo &DCI, unsigned &ExtraSteps, bool &UseOneConst) const {
+  UseOneConst = true;
+  return getEstimate(*Subtarget, DCI, AArch64ISD::FRSQRTE, Operand, ExtraSteps);
+}
+
 //===----------------------------------------------------------------------===//
 //                          AArch64 Inline Assembly Support
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index cf1c12292fc..65e2614461d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -187,6 +187,10 @@ enum NodeType : unsigned {
   SMULL,
   UMULL,
 
+  // Reciprocal estimates.
+  FRECPE,
+  FRSQRTE,
+
   // NEON Load/Store with post-increment base updates
   LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
   LD3post,
@@ -511,6 +515,11 @@ private:
   SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                         std::vector<SDNode *> *Created) const override;
+  SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI,
+                           unsigned &RefinementSteps,
+                           bool &UseOneConstNR) const override;
+  SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI,
+                           unsigned &RefinementSteps) const override;
 
   unsigned combineRepeatedFPDivisors() const override;
 
   ConstraintType getConstraintType(StringRef Constraint) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 1d37b6ac39e..9e3954905ce 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -283,6 +283,9 @@ def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
 def AArch64smull    : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>;
 def AArch64umull    : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>;
 
+def AArch64frecpe   : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>;
+def AArch64frsqrte  : SDNode<"AArch64ISD::FRSQRTE", SDTFPUnaryOp>;
+
 def AArch64saddv    : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>;
 def AArch64uaddv    : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>;
 def AArch64sminv    : SDNode<"AArch64ISD::SMINV", SDT_AArch64UnaryVec>;
@@ -3401,6 +3404,19 @@ def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))),
 def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 FPR64:$Rn))),
           (FRECPEv1i64 FPR64:$Rn)>;
 
+def : Pat<(f32 (AArch64frecpe (f32 FPR32:$Rn))),
+          (FRECPEv1i32 FPR32:$Rn)>;
+def : Pat<(v2f32 (AArch64frecpe (v2f32 V64:$Rn))),
+          (FRECPEv2f32 V64:$Rn)>;
+def : Pat<(v4f32 (AArch64frecpe (v4f32 FPR128:$Rn))),
+          (FRECPEv4f32 FPR128:$Rn)>;
+def : Pat<(f64 (AArch64frecpe (f64 FPR64:$Rn))),
+          (FRECPEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (AArch64frecpe (v1f64 FPR64:$Rn))),
+          (FRECPEv1i64 FPR64:$Rn)>;
+def : Pat<(v2f64 (AArch64frecpe (v2f64 FPR128:$Rn))),
+          (FRECPEv2f64 FPR128:$Rn)>;
+
 def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))),
           (FRECPXv1i32 FPR32:$Rn)>;
 def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))),
@@ -3413,6 +3429,19 @@ def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))),
 def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))),
           (FRSQRTEv1i64 FPR64:$Rn)>;
 
+def : Pat<(f32 (AArch64frsqrte (f32 FPR32:$Rn))),
+          (FRSQRTEv1i32 FPR32:$Rn)>;
+def : Pat<(v2f32 (AArch64frsqrte (v2f32 V64:$Rn))),
+          (FRSQRTEv2f32 V64:$Rn)>;
+def : Pat<(v4f32 (AArch64frsqrte (v4f32 FPR128:$Rn))),
+          (FRSQRTEv4f32 FPR128:$Rn)>;
+def : Pat<(f64 (AArch64frsqrte (f64 FPR64:$Rn))),
+          (FRSQRTEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (AArch64frsqrte (v1f64 FPR64:$Rn))),
+          (FRSQRTEv1i64 FPR64:$Rn)>;
+def : Pat<(v2f64 (AArch64frsqrte (v2f64 FPR128:$Rn))),
+          (FRSQRTEv2f64 FPR128:$Rn)>;
+
 // If an integer is about to be converted to a floating point value,
 // just load it on the floating point unit.
 // Here are the patterns for 8 and 16-bits to float.
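For reference, a minimal C++ sketch (not part of the patch) of how getEstimate() above forms the TargetRecip lookup key; buildRecipOpKey and its bool parameters are hypothetical stand-ins for the EVT queries in the real code:

#include <string>

static std::string buildRecipOpKey(bool IsRecip, bool IsVector, bool IsDouble) {
  std::string Key = IsRecip ? "div" : "sqrt"; // FRECPE -> "div", FRSQRTE -> "sqrt"
  Key = (IsVector ? "vec-" : "") + Key;       // vector types gain a "vec-" prefix
  Key += IsDouble ? "d" : "f";                // suffix by scalar element type
  return Key;                                 // e.g. v2f64 rsqrt -> "vec-sqrtd"
}

These are the same keys toggled by the -recip option in the tests below; a '!' prefix, as in -recip=!div,!vec-div, disables the corresponding estimate.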
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 33f65ceeae6..f184efecf3b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -136,6 +136,30 @@ static std::string computeDataLayout(const Triple &TT, bool LittleEndian) {
   return "E-m:e-i64:64-i128:128-n32:64-S128";
 }
 
+// Helper function to set up the defaults for reciprocals.
+static void initReciprocals(AArch64TargetMachine& TM, AArch64Subtarget& ST)
+{
+  // For the estimates, convergence is quadratic, so the number of correct
+  // digits essentially doubles after each iteration.  On ARMv8, the minimum
+  // architected accuracy of the initial estimate is 2^-8.  Therefore, the
+  // numbers of extra steps needed to refine the result for float (23 mantissa
+  // bits) and for double (52 mantissa bits) are 2 and 3, respectively.
+  unsigned ExtraStepsF = 2,
+           ExtraStepsD = ExtraStepsF + 1;
+  // FIXME: Enable x^-1/2 only for Exynos M1 at the moment.
+  bool UseRsqrt = ST.isExynosM1();
+
+  TM.Options.Reciprocals.setDefaults("sqrtf", UseRsqrt, ExtraStepsF);
+  TM.Options.Reciprocals.setDefaults("sqrtd", UseRsqrt, ExtraStepsD);
+  TM.Options.Reciprocals.setDefaults("vec-sqrtf", UseRsqrt, ExtraStepsF);
+  TM.Options.Reciprocals.setDefaults("vec-sqrtd", UseRsqrt, ExtraStepsD);
+
+  TM.Options.Reciprocals.setDefaults("divf", false, ExtraStepsF);
+  TM.Options.Reciprocals.setDefaults("divd", false, ExtraStepsD);
+  TM.Options.Reciprocals.setDefaults("vec-divf", false, ExtraStepsF);
+  TM.Options.Reciprocals.setDefaults("vec-divd", false, ExtraStepsD);
+}
+
 /// TargetMachine ctor - Create an AArch64 architecture model.
 ///
 AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT,
@@ -149,7 +173,8 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT,
     : LLVMTargetMachine(T, computeDataLayout(TT, LittleEndian), TT, CPU, FS,
                         Options, RM, CM, OL),
       TLOF(createTLOF(getTargetTriple())),
-      isLittle(LittleEndian) {
+      Subtarget(TT, CPU, FS, *this, LittleEndian) {
+  initReciprocals(*this, Subtarget);
   initAsmInfo();
 }
 
@@ -189,7 +214,7 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
     // function that reside in TargetOptions.
     resetTargetOptions(F);
     I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this,
-                                            isLittle);
+                                            Subtarget.isLittleEndian());
 #ifndef LLVM_BUILD_GLOBAL_ISEL
    GISelAccessor *GISel = new GISelAccessor();
 #else
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/llvm/lib/Target/AArch64/AArch64TargetMachine.h
index 8d49a29386a..aac98a205e0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.h
@@ -46,7 +46,7 @@ public:
   }
 
 private:
-  bool isLittle;
+  AArch64Subtarget Subtarget;
 };
 
 // AArch64leTargetMachine - AArch64 little endian target machine.
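A quick sanity check of those step counts, assuming (as the comment in initReciprocals states) an initial estimate accurate to 2^-8 and quadratic convergence, so the relative error e(n) squares on each refinement step:

    e(n+1) ~ e(n)^2,  e(0) <= 2^-8   =>   e(n) ~ 2^(-8 * 2^n)

    float  (23 mantissa bits): n = 2 gives 2^-32 <= 2^-23, while n = 1 reaches only 2^-16
    double (52 mantissa bits): n = 3 gives 2^-64 <= 2^-52, while n = 2 reaches only 2^-32

Hence ExtraStepsF = 2 and ExtraStepsD = 3 above.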
diff --git a/llvm/test/CodeGen/AArch64/recp-fastmath.ll b/llvm/test/CodeGen/AArch64/recp-fastmath.ll
new file mode 100644
index 00000000000..710739b2cc5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/recp-fastmath.ll
@@ -0,0 +1,79 @@
+; RUN: llc < %s -mtriple=aarch64 -mattr=neon -recip=!div,!vec-div | FileCheck %s --check-prefix=FAULT
+; RUN: llc < %s -mtriple=aarch64 -mattr=neon -recip=div,vec-div   | FileCheck %s
+
+define float @frecp(float %x) #0 {
+  %div = fdiv fast float 1.0, %x
+  ret float %div
+
+; FAULT-LABEL: frecp:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fmov
+; FAULT-NEXT: fdiv
+
+; CHECK-LABEL: frecp:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: frecpe
+; CHECK-NEXT: fmov
+}
+
+define <2 x float> @f2recp(<2 x float> %x) #0 {
+  %div = fdiv fast <2 x float> <float 1.0, float 1.0>, %x
+  ret <2 x float> %div
+
+; FAULT-LABEL: f2recp:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fmov
+; FAULT-NEXT: fdiv
+
+; CHECK-LABEL: f2recp:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frecpe
+}
+
+define <4 x float> @f4recp(<4 x float> %x) #0 {
+  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
+  ret <4 x float> %div
+
+; FAULT-LABEL: f4recp:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fmov
+; FAULT-NEXT: fdiv
+
+; CHECK-LABEL: f4recp:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frecpe
+}
+
+define double @drecp(double %x) #0 {
+  %div = fdiv fast double 1.0, %x
+  ret double %div
+
+; FAULT-LABEL: drecp:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fmov
+; FAULT-NEXT: fdiv
+
+; CHECK-LABEL: drecp:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: frecpe
+; CHECK-NEXT: fmov
+}
+
+define <2 x double> @d2recp(<2 x double> %x) #0 {
+  %div = fdiv fast <2 x double> <double 1.0, double 1.0>, %x
+  ret <2 x double> %div
+
+; FAULT-LABEL: d2recp:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fmov
+; FAULT-NEXT: fdiv
+
+; CHECK-LABEL: d2recp:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frecpe
+}
+
+attributes #0 = { nounwind "unsafe-fp-math"="true" }
diff --git a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
new file mode 100644
index 00000000000..8bc85a5628c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
@@ -0,0 +1,158 @@
+; RUN: llc < %s -mtriple=aarch64 -mattr=neon -recip=!sqrt,!vec-sqrt | FileCheck %s --check-prefix=FAULT
+; RUN: llc < %s -mtriple=aarch64 -mattr=neon -recip=sqrt,vec-sqrt   | FileCheck %s
+
+declare float @llvm.sqrt.f32(float) #1
+declare double @llvm.sqrt.f64(double) #1
+declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) #1
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #1
+declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) #1
+
+define float @fsqrt(float %a) #0 {
+  %1 = tail call fast float @llvm.sqrt.f32(float %a)
+  ret float %1
+
+; FAULT-LABEL: fsqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: fsqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frsqrte
+}
+
+define <2 x float> @f2sqrt(<2 x float> %a) #0 {
+  %1 = tail call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> %a) #2
+  ret <2 x float> %1
+
+; FAULT-LABEL: f2sqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: f2sqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: mov
+; CHECK-NEXT: frsqrte
+}
+
+define <4 x float> @f4sqrt(<4 x float> %a) #0 {
+  %1 = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) #2
+  ret <4 x float> %1
+
+; FAULT-LABEL: f4sqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: f4sqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: mov
+; CHECK-NEXT: frsqrte
+}
+
+define double @dsqrt(double %a) #0 {
+  %1 = tail call fast double @llvm.sqrt.f64(double %a)
+  ret double %1
+
+; FAULT-LABEL: dsqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: dsqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frsqrte
+}
+
+define <2 x double> @d2sqrt(<2 x double> %a) #0 {
+  %1 = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) #2
+  ret <2 x double> %1
+
+; FAULT-LABEL: d2sqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: d2sqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: mov
+; CHECK-NEXT: frsqrte
+}
+
+define float @frsqrt(float %a) #0 {
+  %1 = tail call fast float @llvm.sqrt.f32(float %a)
+  %2 = fdiv fast float 1.000000e+00, %1
+  ret float %2
+
+; FAULT-LABEL: frsqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: frsqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frsqrte
+}
+
+define <2 x float> @f2rsqrt(<2 x float> %a) #0 {
+  %1 = tail call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> %a) #2
+  %2 = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %1
+  ret <2 x float> %2
+
+; FAULT-LABEL: f2rsqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: f2rsqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frsqrte
+}
+
+define <4 x float> @f4rsqrt(<4 x float> %a) #0 {
+  %1 = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) #2
+  %2 = fdiv fast <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %1
+  ret <4 x float> %2
+
+; FAULT-LABEL: f4rsqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: f4rsqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frsqrte
+}
+
+define double @drsqrt(double %a) #0 {
+  %1 = tail call fast double @llvm.sqrt.f64(double %a)
+  %2 = fdiv fast double 1.000000e+00, %1
+  ret double %2
+
+; FAULT-LABEL: drsqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: drsqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frsqrte
+}
+
+define <2 x double> @d2rsqrt(<2 x double> %a) #0 {
+  %1 = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) #2
+  %2 = fdiv fast <2 x double> <double 1.000000e+00, double 1.000000e+00>, %1
+  ret <2 x double> %2
+
+; FAULT-LABEL: d2rsqrt:
+; FAULT-NEXT: BB#0
+; FAULT-NEXT: fsqrt
+
+; CHECK-LABEL: d2rsqrt:
+; CHECK-NEXT: BB#0
+; CHECK-NEXT: fmov
+; CHECK-NEXT: frsqrte
+}
+
+attributes #0 = { nounwind "unsafe-fp-math"="true" }
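For intuition about what the refinement steps compute, here is a small standalone C++ sketch of the textbook Newton-Raphson recurrences for the reciprocal and the reciprocal square root. It mirrors the math only; the exact DAG the combiner builds from FRECPE/FRSQRTE may associate the operations differently, and the initial-estimate values below merely stand in for the instruction results:

#include <cstdio>

// Reciprocal of a: x(n+1) = x(n) * (2 - a * x(n)); quadratic convergence.
static float recip(float a, float est, unsigned ExtraSteps) {
  for (unsigned i = 0; i < ExtraSteps; ++i)
    est = est * (2.0f - a * est);
  return est;
}

// Reciprocal square root of a: x(n+1) = x(n) * (1.5 - 0.5 * a * x(n)^2).
static float rsqrt(float a, float est, unsigned ExtraSteps) {
  for (unsigned i = 0; i < ExtraSteps; ++i)
    est = est * (1.5f - 0.5f * a * est * est);
  return est;
}

int main() {
  // ARMv8 guarantees FRECPE/FRSQRTE results accurate to at least 2^-8, so
  // two extra steps recover full float precision, matching ExtraStepsF.
  std::printf("1/3       ~ %.8f\n", recip(3.0f, 0.33f, 2)); // converges toward 1/3
  std::printf("1/sqrt(2) ~ %.8f\n", rsqrt(2.0f, 0.70f, 2)); // converges toward 1/sqrt(2)
  return 0;
}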

