diff options
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/Target/ARM/ARM.td | 2 |
| -rw-r--r-- | llvm/lib/Target/ARM/ARMInstrInfo.td | 7 |
| -rw-r--r-- | llvm/lib/Target/ARM/ARMInstrNEON.td | 9 |
| -rw-r--r-- | llvm/lib/Target/ARM/ARMScheduleA8.td | 19 |
| -rw-r--r-- | llvm/lib/Target/ARM/ARMScheduleA9.td | 36 |
| -rw-r--r-- | llvm/lib/Target/ARM/ARMScheduleV6.td | 6 |
| -rw-r--r-- | llvm/lib/Target/ARM/ARMSubtarget.h | 2 |
| -rw-r--r-- | llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 2 |
8 files changed, 73 insertions, 10 deletions
diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td index b05fe629b74..85c41fc75d4 100644 --- a/llvm/lib/Target/ARM/ARM.td +++ b/llvm/lib/Target/ARM/ARM.td @@ -76,8 +76,6 @@ def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",  def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",                                          "true",                                          "Use NEON for single precision FP">; -// Allow more precision in FP computation -def FPContractions : Predicate<"!TM.Options.NoExcessFPPrecision">;  // Disable 32-bit to 16-bit narrowing for experimentation.  def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true", diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 6b8f4cc4327..37284f979d4 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -181,11 +181,11 @@ def HasVFP3          : Predicate<"Subtarget->hasVFP3()">,                                   AssemblerPredicate<"FeatureVFP3">;  def HasVFP4          : Predicate<"Subtarget->hasVFP4()">,                                   AssemblerPredicate<"FeatureVFP4">; -def NoVFP4            : Predicate<"!Subtarget->hasVFP4()">; +def NoVFP4           : Predicate<"!Subtarget->hasVFP4()">;  def HasNEON          : Predicate<"Subtarget->hasNEON()">,                                   AssemblerPredicate<"FeatureNEON">;  def HasNEON2         : Predicate<"Subtarget->hasNEON2()">, -                                 AssemblerPredicate<"FeatureNEON2">; +                                 AssemblerPredicate<"FeatureNEON,FeatureVFP4">;  def NoNEON2          : Predicate<"!Subtarget->hasNEON2()">;  def HasFP16          : Predicate<"Subtarget->hasFP16()">,                                   AssemblerPredicate<"FeatureFP16">; @@ -221,6 +221,9 @@ def UseMovt          : Predicate<"Subtarget->useMovt()">;  def DontUseMovt      : Predicate<"!Subtarget->useMovt()">;  def 
UseFPVMLx        : Predicate<"Subtarget->useFPVMLx()">; +// Allow more precision in FP computation +def FPContractions : Predicate<"!TM.Options.NoExcessFPPrecision">; +  //===----------------------------------------------------------------------===//  // ARM Flag Definitions. diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td index 99dbb95431a..501cc8f4db9 100644 --- a/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -4115,7 +4115,6 @@ defm VQDMLSL  : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,                              "vqdmlsl", "s", int_arm_neon_vqdmlsl>;  defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", int_arm_neon_vqdmlsl>; -  // Fused Vector Multiply-Accumulate and Fused Multiply-Subtract Operations.  def  VFMAfd   : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32",                            v2f32, fmul_su, fadd_mlx>, @@ -4136,10 +4135,10 @@ def  VFMSfq   : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32",  // Match @llvm.fma.* intrinsics  def : Pat<(fma (v2f32 DPR:$src1), (v2f32 DPR:$Vn), (v2f32 DPR:$Vm)),            (VFMAfd DPR:$src1, DPR:$Vn, DPR:$Vm)>, -          Requires<[HasNEON, HasVFP4]>; +          Requires<[HasNEON2]>;  def : Pat<(fma (v4f32 QPR:$src1), (v4f32 QPR:$Vn), (v4f32 QPR:$Vm)),            (VFMAfq QPR:$src1, QPR:$Vn, QPR:$Vm)>, -          Requires<[HasNEON, HasVFP4]>; +          Requires<[HasNEON2]>;  // Vector Subtract Operations. 
@@ -5497,9 +5496,9 @@ def : N3VSMulOpPat<fmul, fadd, VMLAfd>,  def : N3VSMulOpPat<fmul, fsub, VMLSfd>,        Requires<[HasNEON, UseNEONForFP, UseFPVMLx, NoNEON2]>;  def : N3VSMulOpPat<fmul, fadd, VFMAfd>, -      Requires<[HasNEON2, UseNEONForFP,FPContractions]>; +      Requires<[HasNEON2, UseNEONForFP, FPContractions]>;  def : N3VSMulOpPat<fmul, fsub, VFMSfd>, -      Requires<[HasNEON2, UseNEONForFP,FPContractions]>; +      Requires<[HasNEON2, UseNEONForFP, FPContractions]>;  def : N2VSPat<fabs, VABSfd>;  def : N2VSPat<fneg, VNEGfd>;  def : N3VSPat<NEONfmax, VMAXfd>; diff --git a/llvm/lib/Target/ARM/ARMScheduleA8.td b/llvm/lib/Target/ARM/ARMScheduleA8.td index 8d86c01dc74..8b1fb9386ad 100644 --- a/llvm/lib/Target/ARM/ARMScheduleA8.td +++ b/llvm/lib/Target/ARM/ARMScheduleA8.td @@ -324,6 +324,15 @@ def CortexA8Itineraries : ProcessorItineraries<                                 InstrStage<19, [A8_NPipe], 0>,                                 InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>,    // +  // Single-precision Fused FP MAC +  InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, +                               InstrStage<1, [A8_NPipe]>], [7, 2, 1, 1]>, +  // +  // Double-precision Fused FP MAC +  InstrItinData<IIC_fpFMAC64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, +                               InstrStage<19, [A8_NPipe], 0>, +                               InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>, +  //    // Single-precision FP DIV    InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,                                 InstrStage<20, [A8_NPipe], 0>, @@ -860,6 +869,16 @@ def CortexA8Itineraries : ProcessorItineraries<    InstrItinData<IIC_VMACQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,                                 InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>,    // +  // Double-register Fused FP Multiple-Accumulate +  InstrItinData<IIC_VFMACD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, +                               
InstrStage<1, [A8_NPipe]>], [9, 3, 2, 2]>, +  // +  // Quad-register Fused FP Multiple-Accumulate +  // Result written in N9, but that is relative to the last cycle of multicycle, +  // so we use 10 for those cases +  InstrItinData<IIC_VFMACQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>, +                               InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>, +  //    // Double-register Reciprical Step    InstrItinData<IIC_VRECSD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,                                 InstrStage<1, [A8_NPipe]>], [9, 2, 2]>, diff --git a/llvm/lib/Target/ARM/ARMScheduleA9.td b/llvm/lib/Target/ARM/ARMScheduleA9.td index 49fedf63f8b..0d710cc1ace 100644 --- a/llvm/lib/Target/ARM/ARMScheduleA9.td +++ b/llvm/lib/Target/ARM/ARMScheduleA9.td @@ -604,6 +604,22 @@ def CortexA9Itineraries : ProcessorItineraries<                                 InstrStage<2,  [A9_NPipe]>],                                [9, 1, 1, 1]>,    // +  // Single-precision Fused FP MAC +  InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, +                               InstrStage<1, [A9_MUX0], 0>, +                               InstrStage<1, [A9_DRegsVFP], 0, Required>, +                               InstrStage<9, [A9_DRegsN],   0, Reserved>, +                               InstrStage<1, [A9_NPipe]>], +                              [8, 1, 1, 1]>, +  // +  // Double-precision Fused FP MAC +  InstrItinData<IIC_fpFMAC64, [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>, +                               InstrStage<1,  [A9_MUX0], 0>, +                               InstrStage<1,  [A9_DRegsVFP], 0, Required>, +                               InstrStage<10, [A9_DRegsN],  0, Reserved>, +                               InstrStage<2,  [A9_NPipe]>], +                              [9, 1, 1, 1]>, +  //    // Single-precision FP DIV    InstrItinData<IIC_fpDIV32 , [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,                                 InstrStage<1,  [A9_MUX0], 0>, @@ -1697,6 
+1713,26 @@ def CortexA9Itineraries : ProcessorItineraries<                                 InstrStage<4, [A9_NPipe]>],                                [8, 4, 2, 1]>,    // +  // Double-register Fused FP Multiple-Accumulate +  InstrItinData<IIC_VFMACD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, +                               InstrStage<1, [A9_MUX0], 0>, +                               InstrStage<1, [A9_DRegsN],   0, Required>, +                               // Extra latency cycles since wbck is 7 cycles +                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>, +                               InstrStage<2, [A9_NPipe]>], +                              [6, 3, 2, 1]>, +  // +  // Quad-register Fused FP Multiple-Accumulate +  // Result written in N9, but that is relative to the last cycle of multicycle, +  // so we use 10 for those cases +  InstrItinData<IIC_VFMACQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>, +                               InstrStage<1, [A9_MUX0], 0>, +                               InstrStage<1, [A9_DRegsN],   0, Required>, +                               // Extra latency cycles since wbck is 9 cycles +                               InstrStage<10, [A9_DRegsVFP], 0, Reserved>, +                               InstrStage<4, [A9_NPipe]>], +                              [8, 4, 2, 1]>, +  //    // Double-register Reciprical Step    InstrItinData<IIC_VRECSD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,                                 InstrStage<1, [A9_MUX0], 0>, diff --git a/llvm/lib/Target/ARM/ARMScheduleV6.td b/llvm/lib/Target/ARM/ARMScheduleV6.td index 4d959f565e0..0ace9bc1796 100644 --- a/llvm/lib/Target/ARM/ARMScheduleV6.td +++ b/llvm/lib/Target/ARM/ARMScheduleV6.td @@ -243,6 +243,12 @@ def ARMV6Itineraries : ProcessorItineraries<    // Double-precision FP MAC    InstrItinData<IIC_fpMAC64 , [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>,    // +  // Single-precision Fused FP MAC +  InstrItinData<IIC_fpFMAC32, [InstrStage<1, [V6_Pipe]>], 
[9, 2, 2, 2]>, +  // +  // Double-precision Fused FP MAC +  InstrItinData<IIC_fpFMAC64, [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>, +  //    // Single-precision FP DIV    InstrItinData<IIC_fpDIV32 , [InstrStage<15, [V6_Pipe]>], [20, 2, 2]>,    // diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index 3d9c03d5dd2..5cf54b94a8a 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -45,7 +45,7 @@ protected:    bool HasV6T2Ops;    bool HasV7Ops; -  /// HasVFPv2, HasVFPv3, HasVFPv4, HasNEON, HasNEONVFPv4 - Specify what +  /// HasVFPv2, HasVFPv3, HasVFPv4, HasNEON, HasNEON2 - Specify what    /// floating point ISAs are supported.    bool HasVFPv2;    bool HasVFPv3; diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 34dadf88238..8fa7378ffff 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -4659,6 +4659,7 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,          Mnemonic == "fmrs" || Mnemonic == "fsqrts" || Mnemonic == "fsubs" ||          Mnemonic == "fsts" || Mnemonic == "fcpys" || Mnemonic == "fdivs" ||          Mnemonic == "fmuls" || Mnemonic == "fcmps" || Mnemonic == "fcmpzs" || +        Mnemonic == "vfms" || Mnemonic == "vfnms" ||          (Mnemonic == "movs" && isThumb()))) {      Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1);      CarrySetting = true; @@ -4702,6 +4703,7 @@ getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,        Mnemonic == "orr" || Mnemonic == "mvn" ||        Mnemonic == "rsb" || Mnemonic == "rsc" || Mnemonic == "orn" ||        Mnemonic == "sbc" || Mnemonic == "eor" || Mnemonic == "neg" || +      Mnemonic == "vfm" || Mnemonic == "vfnm" ||        (!isThumb() && (Mnemonic == "smull" || Mnemonic == "mov" ||                        Mnemonic == "mla" || Mnemonic == "smlal" ||                        Mnemonic == "umlal" || 
Mnemonic == "umull"))) {  | 

